From 8e5abfebf0d17b17767271baf6d3f5b7a15c3cd1 Mon Sep 17 00:00:00 2001
From: Yoan Picchi
Date: Thu, 7 Dec 2023 14:29:29 +0000
Subject: [PATCH] Add truffle SVE implementation

Signed-off-by: Yoan Picchi
---
 src/nfa/arm/truffle.hpp              |  72 +++++++++++
 src/nfa/truffle.cpp                  |  12 ++
 src/nfa/truffle_simd.hpp             | 180 +++++++++++++++++++++++++--
 src/util/arch/arm/match.hpp          |  62 +++++++++
 src/util/supervector/supervector.hpp |   2 +-
 5 files changed, 314 insertions(+), 14 deletions(-)

diff --git a/src/nfa/arm/truffle.hpp b/src/nfa/arm/truffle.hpp
index 92333261..73eee3e0 100644
--- a/src/nfa/arm/truffle.hpp
+++ b/src/nfa/arm/truffle.hpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
  * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2023, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -32,6 +33,76 @@
  *
  */
 
+#ifdef HAVE_SVE
+
+/*
+ * blockSingleMask takes in a character set (as masks) and a string, and returns, for each
+ * character of the string, whether or not it is part of the set.
+ *
+ * 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit
+ * represents whether or not a character is in the character set. The 'highclear' and
+ * 'highset' in the name refer to the MSb of the character's byte (allowing two
+ * 128-bit masks to cover all 256 values).
+ *
+ * The masks are arrays of 16 bytes each and are encoded this way:
+ * Let C be a character in the set. The bit describing that character is at byte[C%16]
+ * and, within that byte, at bit[C/16].
+ * For example, 'a' = 0x61, so the resulting mask will be: 0x00 0x40 0x00 0x00 0x00 ...
+ *
+ * Both masks are assumed to be 128 bits wide. If they are larger, the additional bits
+ * must be zero.
+ */
+static really_inline
+svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
+
+    const svuint8_t highconst = svdup_u8(0x80);
+    const svuint8_t pshub_mask = svdup_u8(0x8f);
+    const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
+
+    /*
+     * svtbl does a table lookup. Each byte in the second argument indexes into the array of
+     * bytes in shuf_mask_lo_highclear and saves the result in the corresponding byte of
+     * byte_select_low. We mask the chars so that we use the low nibble of each char as the
+     * index, but we keep the MSb so that high characters (not represented by the highclear
+     * mask) become an out-of-bounds index and result in a 0.
+     */
+    svuint8_t byte_select_low = svtbl(shuf_mask_lo_highclear, svand_x(svptrue_b8(), chars, pshub_mask));
+
+    /*
+     * We flip the MSb of the chars and do the same table lookup with the highset mask.
+     * This way it is the characters with the MSb cleared that produce out-of-bounds indexes.
+     * This allows us to cover the full range (0-127 and 128-255).
+     */
+    svuint8_t char_high_flipped = sveor_x(svptrue_b8(), chars, highconst);
+    svuint8_t byte_select_high = svtbl(shuf_mask_lo_highset, svand_x(svptrue_b8(), char_high_flipped, pshub_mask));
+
+    /*
+     * We have now selected the byte that contains the bit corresponding to the char. We need
+     * to filter it further, otherwise we would match any character that is congruent modulo
+     * 16 with a searched character.
+     *
+     * The low nibble was used previously to select the byte out of the mask. The high nibble
+     * is used to select the bit out of the byte. So we shift everything right by 4.
+     *
+     * Using svtbl, we can make an array where each element is a different bit. Using the high
+     * nibble we can get a mask selecting only the bit out of a byte that may have the relevant
+     * charset char.
+     */
+    svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 4);
+    svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
+    /*
+     * For every lane, only one of the selected bytes may have a value, so we can OR them. We
+     * then apply the bit_select mask. What is left is the bit in the charset encoding the
+     * character in char. A non-zero value means the char was in the charset.
+     *
+     * The _x suffix only works if we process a full char vector. If we were to use a partial
+     * vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
+     * lanes may have arbitrary values.
+     */
+    svuint8_t res = svand_x(svptrue_b8(), svorr_x(svptrue_b8(), byte_select_low, byte_select_high), bit_select);
+
+    return res;
+}
+#else
+
 template <uint16_t S>
 static really_inline
 const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars) {
@@ -60,3 +131,4 @@ const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, Supe
     return !res.eq(SuperVector<S>::Zeroes());
 }
 
+#endif //HAVE_SVE
\ No newline at end of file
diff --git a/src/nfa/truffle.cpp b/src/nfa/truffle.cpp
index c8391445..1e783284 100644
--- a/src/nfa/truffle.cpp
+++ b/src/nfa/truffle.cpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
  * Copyright (c) 2020, 2021, VectorCamp PC
+ * Copyright (c) 2023, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,7 +38,17 @@
 #include "util/bitutils.h"
 #include "truffle_simd.hpp"
 
+#ifdef HAVE_SVE
+const u8 *truffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                      const u8 *buf_end) {
+    return truffleExecSVE(mask_lo, mask_hi, buf, buf_end);
+}
 
+const u8 *rtruffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                       const u8 *buf_end) {
+    return rtruffleExecSVE(mask_lo, mask_hi, buf, buf_end);
+}
+#else
 const u8 *truffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                       const u8 *buf_end) {
     return truffleExecReal(mask_lo, mask_hi, buf, buf_end);
@@ -47,3 +58,4 @@ const u8 *rtruffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                        const u8 *buf_end) {
     return rtruffleExecReal(mask_lo, mask_hi, buf, buf_end);
 }
+#endif //HAVE_SVE
diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp
index c1028156..f7dbc6bb 100644
--- a/src/nfa/truffle_simd.hpp
+++ b/src/nfa/truffle_simd.hpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
  * Copyright (c) 2020-2023, VectorCamp PC
+ * Copyright (c) 2023, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -41,9 +42,14 @@
 #include "util/supervector/supervector.hpp"
 #include "util/match.hpp"
 
+#ifdef HAVE_SVE
+static really_inline
+svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars);
+#else
 template <uint16_t S>
 static really_inline
 const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars);
+#endif //HAVE_SVE
 
 #if defined(VS_SIMDE_BACKEND)
 #include "x86/truffle.hpp"
@@ -57,6 +63,162 @@ const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, Supe
 #endif
 #endif
 
+#ifdef HAVE_SVE
+
+const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
+                         const u8 *buf, const u8 *buf_end);
+
+const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
+                          const u8 *buf, const u8 *buf_end);
+
+static really_inline
+const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars, const u8 *buf, bool forward) {
+
+    const size_t vector_size_int_8 = svcntb();
+
+    const svuint8_t result_mask = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
+    uint64_t index;
+    if (forward) {
+        index = first_non_zero(vector_size_int_8, result_mask);
+    } else {
+        index = last_non_zero(vector_size_int_8, result_mask);
+    }
+
+    if (index < vector_size_int_8) {
+        return buf + index;
+    } else {
+        return NULL;
+    }
+}
+
+really_inline
+const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) {
+    const int vect_size_int8 = svcntb();
+    // Activate only 16 lanes to read the m128 buffers
+    const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
+    assert(buf && buf_end);
+    assert(buf < buf_end);
+    DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf);
+    DEBUG_PRINTF("b %s\n", buf);
+
+    svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear);
+    svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset);
+
+    const u8 *work_buffer = buf;
+    const u8 *ret_val;
+
+    DEBUG_PRINTF("start %p end %p \n", work_buffer, buf_end);
+    assert(work_buffer < buf_end);
+
+    __builtin_prefetch(work_buffer + 16*64);
+
+    if (work_buffer + vect_size_int8 <= buf_end) {
+        // Reach vector aligned boundaries
+        DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(work_buffer, vect_size_int8));
+        if (!ISALIGNED_N(work_buffer, vect_size_int8)) {
+            svuint8_t chars = svld1(svptrue_b8(), work_buffer);
+            const u8 *aligned_buffer = ROUNDUP_PTR(work_buffer, vect_size_int8);
+            ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true);
+            if (ret_val && ret_val < aligned_buffer) return ret_val;
+            work_buffer = aligned_buffer;
+        }
+
+        while (work_buffer + vect_size_int8 <= buf_end) {
+            __builtin_prefetch(work_buffer + 16*64);
+            DEBUG_PRINTF("work_buffer %p \n", work_buffer);
+            svuint8_t chars = svld1(svptrue_b8(), work_buffer);
+            ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true);
+            if (ret_val) return ret_val;
+            work_buffer += vect_size_int8;
+        }
+    }
+
+    DEBUG_PRINTF("work_buffer %p e %p \n", work_buffer, buf_end);
+    // finish off tail
+
+    if (work_buffer != buf_end) {
+        svuint8_t chars;
+        const u8* end_buf;
+        if (buf_end - buf < vect_size_int8) {
+            const svbool_t remaining_lanes = svwhilelt_b8(0ll, buf_end - buf);
+            chars = svld1(remaining_lanes, buf);
+            end_buf = buf;
+        } else {
+            chars = svld1(svptrue_b8(), buf_end - vect_size_int8);
+            end_buf = buf_end - vect_size_int8;
+        }
+        ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf, true);
+        DEBUG_PRINTF("ret_val %p \n", ret_val);
+        if (ret_val && ret_val < buf_end) return ret_val;
+    }
+
+    return buf_end;
+}
+
+really_inline
+const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) {
+    const int vect_size_int8 = svcntb();
+    // Activate only 16 lanes to read the m128 buffers
+    const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
+    assert(buf && buf_end);
+    assert(buf < buf_end);
+    DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf);
+    DEBUG_PRINTF("b %s\n", buf);
+
+    svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear);
+    svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset);
+
+    const u8 *work_buffer = buf_end;
+    const u8 *ret_val;
+
+    DEBUG_PRINTF("start %p end %p \n", buf, work_buffer);
+    assert(work_buffer > buf);
+
+    __builtin_prefetch(work_buffer - 16*64);
+
+    if (work_buffer - vect_size_int8 >= buf) {
+        // Reach vector aligned boundaries
+        DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(work_buffer, vect_size_int8));
+        if (!ISALIGNED_N(work_buffer, vect_size_int8)) {
+            svuint8_t chars = svld1(svptrue_b8(), work_buffer - vect_size_int8);
+            const u8 *aligned_buffer = ROUNDDOWN_PTR(work_buffer, vect_size_int8);
+            ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer - vect_size_int8, false);
+            DEBUG_PRINTF("ret_val %p \n", ret_val);
+            if (ret_val >= aligned_buffer) return ret_val;
+            work_buffer = aligned_buffer;
+        }
+
+        while (work_buffer - vect_size_int8 >= buf) {
+            DEBUG_PRINTF("aligned %p \n", work_buffer);
+            // On large packet buffers, this prefetch appears to get us about 2%.
+            __builtin_prefetch(work_buffer - 16*64);
+
+            work_buffer -= vect_size_int8;
+            svuint8_t chars = svld1(svptrue_b8(), work_buffer);
+            ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, false);
+            if (ret_val) return ret_val;
+        }
+    }
+
+    DEBUG_PRINTF("tail work_buffer %p e %p \n", buf, work_buffer);
+    // finish off head
+
+    if (work_buffer != buf) {
+        svuint8_t chars;
+        if (buf_end - buf < vect_size_int8) {
+            const svbool_t remaining_lanes = svwhilele_b8(0ll, buf_end - buf);
+            chars = svld1(remaining_lanes, buf);
+        } else {
+            chars = svld1(svptrue_b8(), buf);
+        }
+        ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf, false);
+        DEBUG_PRINTF("ret_val %p \n", ret_val);
+        if (ret_val && ret_val < buf_end) return ret_val;
+    }
+
+    return buf - 1;
+}
+#else
 template <uint16_t S>
 static really_inline
 const u8 *fwdBlock(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars, const u8 *buf) {
@@ -77,13 +239,7 @@ const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highse
 
     const u8 *d = buf;
     const u8 *rv;
-    DEBUG_PRINTF("start %p end %p \n", d, buf_end);
-    assert(d < buf_end);
-
-    __builtin_prefetch(d + 64);
-    __builtin_prefetch(d + 2*64);
-    __builtin_prefetch(d + 3*64);
-    __builtin_prefetch(d + 4*64);
+    __builtin_prefetch(d + 16*64);
     DEBUG_PRINTF("start %p end %p \n", d, buf_end);
     assert(d < buf_end);
     if (d + S <= buf_end) {
@@ -98,7 +254,7 @@ const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highse
         }
 
         while(d + S <= buf_end) {
-            __builtin_prefetch(d + 64);
+            __builtin_prefetch(d + 16*64);
             DEBUG_PRINTF("d %p \n", d);
             SuperVector<S> chars = SuperVector<S>::load(d);
             rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d);
@@ -149,10 +305,7 @@ const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highse
 
     const u8 *d = buf_end;
     const u8 *rv;
-    __builtin_prefetch(d - 64);
-    __builtin_prefetch(d - 2*64);
-    __builtin_prefetch(d - 3*64);
-    __builtin_prefetch(d - 4*64);
+    __builtin_prefetch(d - 16*64);
     DEBUG_PRINTF("start %p end %p \n", buf, d);
     assert(d > buf);
     if (d - S >= buf) {
@@ -170,7 +323,7 @@ const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highse
         while (d - S >= buf) {
             DEBUG_PRINTF("aligned %p \n", d);
             // On large packet buffers, this prefetch appears to get us about 2%.
-            __builtin_prefetch(d - 64);
+            __builtin_prefetch(d - 16*64);
 
             d -= S;
             SuperVector<S> chars = SuperVector<S>::load(d);
@@ -196,3 +349,4 @@ const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highse
     return buf - 1;
 }
 
+#endif //HAVE_SVE
\ No newline at end of file
diff --git a/src/util/arch/arm/match.hpp b/src/util/arch/arm/match.hpp
index 1280fed5..afeedc0f 100644
--- a/src/util/arch/arm/match.hpp
+++ b/src/util/arch/arm/match.hpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
  * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2023, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -99,3 +100,64 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
     }
 }
 
+#ifdef HAVE_SVE
+
+
+static really_inline
+uint64_t last_non_zero_real(svuint8_t mask) {
+    const svuint64_t leading_zeros = svclz_x(svptrue_b64(), svreinterpret_u64(mask));
+
+    uint64_t last_active_lane;
+
+    svbool_t remaining_mask = svptrue_b64();
+    uint64_t i = 0;
+    while (svptest_any(svptrue_b64(), remaining_mask)) {
+        svbool_t single_lane_mask = svpnext_b64(remaining_mask, svpfalse());
+        remaining_mask = sveor_z(svptrue_b64(), remaining_mask, single_lane_mask);
+        uint64_t active_element = svlastb(single_lane_mask, leading_zeros);
+        if (active_element < 64) {
+            uint64_t lane_index = (i+1)*8 - (active_element/8) - 1;
+            last_active_lane = lane_index;
+        }
+        i++;
+    }
+    return last_active_lane;
+}
+
+/*
+ * It is assumed the mask has the value 0 in all inactive lanes, if any.
+ */
+static really_inline
+uint64_t last_non_zero(const size_t vector_size_int_8, svuint8_t mask) {
+    const svbool_t result_pred = svcmpne(svptrue_b8(), mask, 0);
+
+    if (svptest_any(svptrue_b8(), result_pred)) {
+        return last_non_zero_real(mask);
+    } else {
+        return vector_size_int_8;
+    }
+}
+
+/*
+ * It is assumed the mask has the value 0 in all inactive lanes, if any.
+ */
+static really_inline
+uint64_t first_non_zero(const size_t vector_size_int_8, svuint8_t mask) {
+    const svbool_t result_pred = svcmpne(svptrue_b8(), mask, 0);
+
+    if (svptest_any(svptrue_b8(), result_pred)) {
+
+        // We don't have a CTZ instruction, but we can work around it by reversing the lane order.
+        const svuint64_t rev_large_res = svreinterpret_u64(svrev(mask));
+        // Each group of 8 leading zeros means one empty lane. So if we have 18 leading zeros,
+        // the third lane has a matching character.
+        uint64_t first_active_lane = last_non_zero_real(svreinterpret_u8(rev_large_res));
+        // We reversed the lanes, so we reverse back the index.
+        first_active_lane = (vector_size_int_8-1) - first_active_lane;
+        return first_active_lane;
+    } else {
+        return vector_size_int_8;
+    }
+}
+
+#endif //HAVE_SVE
\ No newline at end of file
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index 253907fa..5e2de235 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -138,7 +138,7 @@ struct BaseVector<64>
     static constexpr u16 previous_size = 32;
 };
 
-// 128 bit implementation
+// 256 bit implementation
 template <>
 struct BaseVector<32>
 {
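
Illustrative note (not part of the patch): the mask layout documented in blockSingleMask's
comment can be modelled in plain scalar C++. The helper names below (truffle_set_masks,
truffle_test_byte) are hypothetical and are not vectorscan APIs; the sketch only mirrors, one
byte at a time, what the SVE code computes per lane, assuming the byte[C%16] / bit[C/16]
encoding described above.

#include <array>
#include <cstdint>

// Build the two 16-byte masks for a character set: characters 0x00-0x7f go into
// lo_highclear, characters 0x80-0xff into lo_highset. Character C sets bit
// ((C % 128) / 16) of mask byte (C % 16).
static void truffle_set_masks(const bool (&in_set)[256],
                              std::array<uint8_t, 16> &lo_highclear,
                              std::array<uint8_t, 16> &lo_highset) {
    lo_highclear.fill(0);
    lo_highset.fill(0);
    for (unsigned c = 0; c < 256; c++) {
        if (!in_set[c]) { continue; }
        std::array<uint8_t, 16> &mask = (c < 128) ? lo_highclear : lo_highset;
        mask[c % 16] |= static_cast<uint8_t>(1u << ((c % 128) / 16));
    }
}

// Scalar equivalent of one lane of blockSingleMask: the low nibble selects the byte,
// the high nibble (modulo 8) selects the bit. A non-zero result means 'c' is in the set.
static uint8_t truffle_test_byte(const std::array<uint8_t, 16> &lo_highclear,
                                 const std::array<uint8_t, 16> &lo_highset,
                                 uint8_t c) {
    const std::array<uint8_t, 16> &mask = (c < 128) ? lo_highclear : lo_highset;
    uint8_t byte_select = mask[c % 16];
    uint8_t bit_select = static_cast<uint8_t>(1u << ((c % 128) / 16));
    return byte_select & bit_select;
}

// For the set {'a'}, lo_highclear[1] == 0x40 (matching the 0x00 0x40 0x00 ... example in
// the comment), truffle_test_byte(lo_highclear, lo_highset, 'a') returns 0x40, and any
// character outside the set yields 0.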