From 938c026256e344521a6dea4e4a6c509ebc907a16 Mon Sep 17 00:00:00 2001 From: Yoan Picchi Date: Tue, 23 Apr 2024 12:04:40 +0000 Subject: [PATCH] Speed up truffle with 256b TBL instructions 256b wide SVE vectors allow some simplification of truffle. Up to 40% speedup on graviton3. Going from 12500 MB/s to 17000 MB/s onhe microbenchmark. SVE2 also offer this capability for 128b vector with a speedup around 25% compared to normal SVE Add unit tests and benchmark for this wide variant Signed-off-by: Yoan Picchi --- benchmarks/benchmarks.cpp | 65 ++- benchmarks/benchmarks.hpp | 14 +- src/hwlm/hwlm.c | 7 +- src/nfa/accel.c | 11 +- src/nfa/accel.h | 15 +- src/nfa/accel_dfa_build_strat.cpp | 17 +- src/nfa/accel_dump.cpp | 24 +- src/nfa/accelcompile.cpp | 17 +- src/nfa/arm/truffle.hpp | 155 +++++- src/nfa/mcclellandump.cpp | 4 + src/nfa/mcsheng_dump.cpp | 1 + src/nfa/truffle.cpp | 37 +- src/nfa/truffle.h | 8 + src/nfa/truffle_simd.hpp | 99 ++-- src/nfa/trufflecompile.cpp | 29 ++ src/nfa/trufflecompile.h | 6 + src/rose/rose_build_lit_accel.cpp | 15 +- src/util/arch/arm/arm.h | 6 + src/util/arch/arm/simd_types.h | 4 + src/util/supervector/arch/arm/types.hpp | 3 + unit/CMakeLists.txt | 1 + unit/internal/sheng.cpp | 13 +- unit/internal/truffleWide.cpp | 652 ++++++++++++++++++++++++ 23 files changed, 1125 insertions(+), 78 deletions(-) create mode 100644 unit/internal/truffleWide.cpp diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp index 50c5d7fa..c4c93699 100644 --- a/benchmarks/benchmarks.cpp +++ b/benchmarks/benchmarks.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2020, 2021, VectorCamp PC + * Copyright (c) 2023, 2024, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +35,7 @@ #include #include +#include "util/arch.h" #include "benchmarks.hpp" #define MAX_LOOPS 1000000000 @@ -145,11 +147,13 @@ int main(){ sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, [&](MicroBenchmark &b) { b.chars.set('a'); - ue2::shuftiBuildMasks(b.chars, reinterpret_cast(&b.lo), reinterpret_cast(&b.hi)); + ue2::shuftiBuildMasks(b.chars, + reinterpret_cast(&b.truffle_mask_lo), + reinterpret_cast(&b.truffle_mask_hi)); memset(b.buf.data(), 'b', b.size); }, [&](MicroBenchmark &b) { - return shuftiExec(b.lo, b.hi, b.buf.data(), + return shuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(), b.buf.data() + b.size); }); } @@ -160,11 +164,13 @@ int main(){ sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench, [&](MicroBenchmark &b) { b.chars.set('a'); - ue2::shuftiBuildMasks(b.chars, reinterpret_cast(&b.lo), reinterpret_cast(&b.hi)); + ue2::shuftiBuildMasks(b.chars, + reinterpret_cast(&b.truffle_mask_lo), + reinterpret_cast(&b.truffle_mask_hi)); memset(b.buf.data(), 'b', b.size); }, [&](MicroBenchmark &b) { - return rshuftiExec(b.lo, b.hi, b.buf.data(), + return rshuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(), b.buf.data() + b.size); }); } @@ -175,11 +181,13 @@ int main(){ sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, [&](MicroBenchmark &b) { b.chars.set('a'); - ue2::truffleBuildMasks(b.chars, reinterpret_cast(&b.lo), reinterpret_cast(&b.hi)); + ue2::truffleBuildMasks(b.chars, + reinterpret_cast(&b.truffle_mask_lo), + reinterpret_cast(&b.truffle_mask_hi)); memset(b.buf.data(), 'b', b.size); }, [&](MicroBenchmark &b) { - return truffleExec(b.lo, b.hi, b.buf.data(), + return truffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(), b.buf.data() + b.size); }); } @@ -190,14 +198,47 @@ int main(){ sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench, [&](MicroBenchmark &b) { b.chars.set('a'); - ue2::truffleBuildMasks(b.chars, reinterpret_cast(&b.lo), reinterpret_cast(&b.hi)); + ue2::truffleBuildMasks(b.chars, + reinterpret_cast(&b.truffle_mask_lo), + reinterpret_cast(&b.truffle_mask_hi)); memset(b.buf.data(), 'b', b.size); }, [&](MicroBenchmark &b) { - return rtruffleExec(b.lo, b.hi, b.buf.data(), + return rtruffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(), b.buf.data() + b.size); }); } +#ifdef CAN_USE_WIDE_TRUFFLE + if(CAN_USE_WIDE_TRUFFLE) { + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Truffle Wide", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::truffleBuildMasksWide(b.chars, reinterpret_cast(&b.truffle_mask)); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return truffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size); + } + ); + } + + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Reverse Truffle Wide", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::truffleBuildMasksWide(b.chars, reinterpret_cast(&b.truffle_mask)); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return rtruffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size); + } + ); + } + } +#endif for (size_t i = 0; i < std::size(sizes); i++) { MicroBenchmark bench("Vermicelli", sizes[i]); @@ -205,7 +246,9 @@ int main(){ sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, [&](MicroBenchmark &b) { b.chars.set('a'); - ue2::truffleBuildMasks(b.chars, reinterpret_cast(&b.lo), reinterpret_cast(&b.hi)); + ue2::truffleBuildMasks(b.chars, + reinterpret_cast(&b.truffle_mask_lo), + reinterpret_cast(&b.truffle_mask_hi)); memset(b.buf.data(), 'b', b.size); }, [&](MicroBenchmark &b) { @@ -220,7 +263,9 @@ int main(){ sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench, [&](MicroBenchmark &b) { b.chars.set('a'); - ue2::truffleBuildMasks(b.chars, reinterpret_cast(&b.lo), reinterpret_cast(&b.hi)); + ue2::truffleBuildMasks(b.chars, + reinterpret_cast(&b.truffle_mask_lo), + reinterpret_cast(&b.truffle_mask_hi)); memset(b.buf.data(), 'b', b.size); }, [&](MicroBenchmark &b) { diff --git a/benchmarks/benchmarks.hpp b/benchmarks/benchmarks.hpp index 13f66fa5..96874318 100644 --- a/benchmarks/benchmarks.hpp +++ b/benchmarks/benchmarks.hpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2020, 2021, VectorCamp PC + * Copyright (c) 2024, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,7 +45,18 @@ public: size_t size; // Shufti/Truffle - m128 lo, hi; + union { + m256 truffle_mask; + struct { +#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) + m128 truffle_mask_lo; + m128 truffle_mask_hi; +#else + m128 truffle_mask_hi; + m128 truffle_mask_lo; +#endif + }; + }; ue2::CharReach chars; std::vector buf; diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c index 40349def..3d3c0a71 100644 --- a/src/hwlm/hwlm.c +++ b/src/hwlm/hwlm.c @@ -73,7 +73,12 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr, return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end); case ACCEL_TRUFFLE: DEBUG_PRINTF("truffle\n"); - return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end); + return truffleExec(aux->truffle.mask_lo, aux->truffle.mask_hi, ptr, end); +#ifdef CAN_USE_WIDE_TRUFFLE + case ACCEL_TRUFFLE_WIDE: + DEBUG_PRINTF("truffle wide\n"); + return truffleExecWide(aux->truffle.mask, ptr, end); +#endif // CAN_USE_WIDE_TRUFFLE default: /* no acceleration, fall through and return current ptr */ DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type); diff --git a/src/nfa/accel.c b/src/nfa/accel.c index 7661b7a7..027f1182 100644 --- a/src/nfa/accel.c +++ b/src/nfa/accel.c @@ -142,9 +142,18 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { return c; } - rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end); + rv = truffleExec(accel->truffle.mask_lo, accel->truffle.mask_hi, c, c_end); break; +#ifdef CAN_USE_WIDE_TRUFFLE + case ACCEL_TRUFFLE_WIDE: + DEBUG_PRINTF("accel Truffle Wide %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + rv = truffleExecWide(accel->truffle.mask, c, c_end); + break; +#endif case ACCEL_DSHUFTI: DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end); if (c + 15 + 1 >= c_end) { diff --git a/src/nfa/accel.h b/src/nfa/accel.h index 3fccdd7b..646492b3 100644 --- a/src/nfa/accel.h +++ b/src/nfa/accel.h @@ -66,6 +66,7 @@ enum AccelType { ACCEL_VERM16, ACCEL_DVERM16, ACCEL_DVERM16_MASKED, + ACCEL_TRUFFLE_WIDE, }; /** \brief Structure for accel framework. */ @@ -136,8 +137,18 @@ union AccelAux { struct { u8 accel_type; u8 offset; - m128 mask1; - m128 mask2; + union { + m256 mask; + struct { +#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) + m128 mask_lo; + m128 mask_hi; +#else + m128 mask_hi; + m128 mask_lo; +#endif + }; + }; } truffle; }; diff --git a/src/nfa/accel_dfa_build_strat.cpp b/src/nfa/accel_dfa_build_strat.cpp index 4c72bd31..ef66432c 100644 --- a/src/nfa/accel_dfa_build_strat.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -576,10 +576,19 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx, } assert(!info.cr.none()); - accel->accel_type = ACCEL_TRUFFLE; - truffleBuildMasks(info.cr, - reinterpret_cast(&accel->truffle.mask1), - reinterpret_cast(&accel->truffle.mask2)); +#if defined(CAN_USE_WIDE_TRUFFLE) + if(CAN_USE_WIDE_TRUFFLE) { + accel->accel_type = ACCEL_TRUFFLE_WIDE; + truffleBuildMasksWide(info.cr, + reinterpret_cast(&accel->truffle.mask)); + } else +#endif + { + accel->accel_type = ACCEL_TRUFFLE; + truffleBuildMasks(info.cr, + reinterpret_cast(&accel->truffle.mask_lo), + reinterpret_cast(&accel->truffle.mask_hi)); + } DEBUG_PRINTF("state %hu is truffle\n", this_idx); } diff --git a/src/nfa/accel_dump.cpp b/src/nfa/accel_dump.cpp index c2c5e01f..34b16074 100644 --- a/src/nfa/accel_dump.cpp +++ b/src/nfa/accel_dump.cpp @@ -93,6 +93,8 @@ const char *accelName(u8 accel_type) { return "double-shufti"; case ACCEL_TRUFFLE: return "truffle"; + case ACCEL_TRUFFLE_WIDE: + return "truffle wide"; case ACCEL_RED_TAPE: return "red tape"; default: @@ -178,6 +180,13 @@ void dumpTruffleCharReach(FILE *f, const u8 *hiset, const u8 *hiclear) { describeClass(cr).c_str()); } +static +void dumpWideTruffleCharReach(FILE *f, const u8 *mask) { + CharReach cr = truffle2crWide(mask); + fprintf(f, "count %zu class %s\n", cr.count(), + describeClass(cr).c_str()); +} + static void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) { fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str()); @@ -231,10 +240,17 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) { break; case ACCEL_TRUFFLE: { fprintf(f, "\n"); - dumpTruffleMasks(f, reinterpret_cast(&accel.truffle.mask1), - reinterpret_cast(&accel.truffle.mask2)); - dumpTruffleCharReach(f, reinterpret_cast(&accel.truffle.mask1), - reinterpret_cast(&accel.truffle.mask2)); + dumpTruffleMasks(f, reinterpret_cast(&accel.truffle.mask_lo), + reinterpret_cast(&accel.truffle.mask_hi)); + dumpTruffleCharReach(f, reinterpret_cast(&accel.truffle.mask_lo), + reinterpret_cast(&accel.truffle.mask_hi)); + break; + } + case ACCEL_TRUFFLE_WIDE: { + fprintf(f, "\n"); + dumpTruffleMasks(f, reinterpret_cast(&accel.truffle.mask_lo), + reinterpret_cast(&accel.truffle.mask_hi)); + dumpWideTruffleCharReach(f, reinterpret_cast(&accel.truffle.mask)); break; } default: diff --git a/src/nfa/accelcompile.cpp b/src/nfa/accelcompile.cpp index 5da0df82..b6714051 100644 --- a/src/nfa/accelcompile.cpp +++ b/src/nfa/accelcompile.cpp @@ -97,11 +97,20 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) { if (outs <= ACCEL_MAX_STOP_CHAR) { DEBUG_PRINTF("building Truffle for %zu chars\n", outs); - aux->accel_type = ACCEL_TRUFFLE; aux->truffle.offset = offset; - truffleBuildMasks(info.single_stops, - reinterpret_cast(&aux->truffle.mask1), - reinterpret_cast(&aux->truffle.mask2)); +#if defined(CAN_USE_WIDE_TRUFFLE) + if(CAN_USE_WIDE_TRUFFLE) { + aux->accel_type = ACCEL_TRUFFLE_WIDE; + truffleBuildMasksWide(info.single_stops, + reinterpret_cast(&aux->truffle.mask)); + } else +#endif + { + aux->accel_type = ACCEL_TRUFFLE; + truffleBuildMasks(info.single_stops, + reinterpret_cast(&aux->truffle.mask_lo), + reinterpret_cast(&aux->truffle.mask_hi)); + } return; } diff --git a/src/nfa/arm/truffle.hpp b/src/nfa/arm/truffle.hpp index 73eee3e0..8e0190ec 100644 --- a/src/nfa/arm/truffle.hpp +++ b/src/nfa/arm/truffle.hpp @@ -34,25 +34,82 @@ */ #ifdef HAVE_SVE +#ifdef HAVE_SVE2 /* * blockSingleMask takes in a character set (as masks) and a string and return for each character - * of the string weither or not it is part of the set. + * of the string wether or not it is part of the set. * * 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit * represents whether or not a character is in the character set. The 'highclear' and * 'highset' in the name refers to the MSb of the byte of the character (allowing two * 128-bit masks to cover all 256 values). - * + * + * The mask is an array of 32 bytes and is encoded this way: + * Let C be a character in the set. The bit describing that character is at byte[C%32] and + * within that byte, it's at bit[C/32] + * As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ... + * + * Assume the mask is in one of those configurations: + * - both masks are exactly 128b wide + * - the first mask is exactly 256b wide and the second is zeroed. + * - the first mask is more than 256b wide, with bits past the 256th being zero, and the second mask is zeroed. + */ +static really_inline +svuint8_t blockSingleMaskWideSVE2(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) { + const svuint8_t pshub_mask = svdup_u8(0x1f); + const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201)); + svuint8x2_t shuf_mask_32 = svcreate2(shuf_mask_lo_highclear, shuf_mask_lo_highset); + /* + * svtbl2 does a table lookup. Each byte in the second argument indexes into the array of bytes + * in shuf_mask_32 and saves the result in the corresponding byte of byte_select. + * We mask the chars so that we are using the low nibble of char as the index. + */ + svuint8_t byte_select = svtbl2(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask)); + + /* + * We now have selected the byte that contain the bit corresponding to the char. We need to + * further filter it, otherwise we'd get a match for any character % 32 to a searched character + * + * The low nibble was used previously to select the byte out of the mask. The high nibble is + * used to select the bit out of the byte. So we shift everything right by 5. + * + * Using svtbl, we can make an array where each element is a different bit. Using the high + * nibble we can get a mask selecting only the bit out of a byte that may have the relevant + * charset char. + */ + svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 5); + svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble); + /* + * We apply the bit_select mask onto the selected byte. What is left is the bit in the charset + * encoding the character in char. A non zero value means the char was in the charset + * + * The _x suffix only works if we process a full char vector. If we were to use a partial + * vector, then _z and a mask would be required on this svand only. Otherwise, the disabled + * lanes may have arbitrary values + */ + return svand_x(svptrue_b8(), byte_select, bit_select); +} +#endif //HAVE_SVE2 + +/* + * blockSingleMask takes in a character set (as masks) and a string and return for each character + * of the string wether or not it is part of the set. + * + * 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit + * represents whether or not a character is in the character set. The 'highclear' and + * 'highset' in the name refers to the MSb of the byte of the character (allowing two + * 128-bit masks to cover all 256 values). + * * The masks are arrays of 16 bytes each and are encoded this way: * Let C be a character in the set. The bit describing that character is at byte[C%16] and * within that byte, it's at bit[C/16] * As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x40 0x00 0x00 0x00 ... - * + * * Assume both mask are 128b wide. If they are larger, the additional bits must be zero */ static really_inline -svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) { +svuint8_t blockSingleMaskSVE(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) { const svuint8_t highconst = svdup_u8(0x80); const svuint8_t pshub_mask = svdup_u8(0x8f); @@ -67,7 +124,7 @@ svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_ */ svuint8_t byte_select_low = svtbl(shuf_mask_lo_highclear, svand_x(svptrue_b8(), chars, pshub_mask)); - /* + /* * We flip the MSb of the chars and do the same table lookup with the highset mask. * This way it's the characters with MSb cleared that will result in out of bands indexes. * This allows us to cover the full range (0-127 and 128-255) @@ -78,10 +135,10 @@ svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_ /* * We now have selected the byte that contain the bit corresponding to the char. We need to * further filter it, otherwise we'd get a match for any character % 16 to a searched character - * + * * The low nibble was used previously to select the byte out of the mask. The high nibble is * used to select the bit out of the byte. So we shift everything right by 4. - * + * * Using svtbl, we can make an array where each element is a different bit. Using the high * nibble we can get a mask selecting only the bit out of a byte that may have the relevant * charset char. @@ -92,17 +149,88 @@ svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_ * For every lane, only one of the byte selected may have a value, so we can OR them. We * then apply the bit_select mask. What is left is the bit in the charset encoding the * character in char. A non zero value means the char was in the charset - * + * * The _x suffix only works if we process a full char vector. If we were to use a partial * vector, then _z and a mask would be required on this svand only. Otherwise, the disabled * lanes may have arbitrary values */ - svuint8_t res = svand_x(svptrue_b8(), svorr_x(svptrue_b8(), byte_select_low, byte_select_high), bit_select); - - return res; + return svand_x(svptrue_b8(), svorr_x(svptrue_b8(), byte_select_low, byte_select_high), bit_select); } -#else +/* + * blockSingleMask takes in a character set (as masks) and a string and return for each character + * of the string wether or not it is part of the set. + * + * 'shuf_mask_32' is a 256-bit masks where each bit represents whether or not a character is in + * the character set. + * + * The mask is an array of 32 bytes and is encoded this way: + * Let C be a character in the set. The bit describing that character is at byte[C%32] and + * within that byte, it's at bit[C/32] + * As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ... + * + * Assume both mask are 128b wide. If they are larger, the additional bits must be zero + */ +static really_inline +svuint8_t blockSingleMaskWideSVE(svuint8_t shuf_mask_32, svuint8_t chars) {//TODO I might have issues with the type + + const svuint8_t pshub_mask = svdup_u8(0x1f); + const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201)); + + /* + * svtbl does a table lookup. Each byte in the second argument indexes into the array of bytes + * in shuf_mask_32 and saves the result in the corresponding byte of byte_select. + * We mask the chars so that we are using the low nibble of char as the index. + */ + svuint8_t byte_select = svtbl(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask)); + + /* + * We now have selected the byte that contain the bit corresponding to the char. We need to + * further filter it, otherwise we'd get a match for any character % 32 to a searched character + * + * The low nibble was used previously to select the byte out of the mask. The high nibble is + * used to select the bit out of the byte. So we shift everything right by 5. + * + * Using svtbl, we can make an array where each element is a different bit. Using the high + * nibble we can get a mask selecting only the bit out of a byte that may have the relevant + * charset char. + */ + svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 5); + svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble); + /* + * We apply the bit_select mask onto the selected byte. What is left is the bit in the charset + * encoding the character in char. A non zero value means the char was in the charset + * + * The _x suffix only works if we process a full char vector. If we were to use a partial + * vector, then _z and a mask would be required on this svand only. Otherwise, the disabled + * lanes may have arbitrary values + */ + return svand_x(svptrue_b8(), byte_select, bit_select); +} + +/* require normal truffle compilation. The 256b mask is split between the two parameters */ +static really_inline +svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) { + return blockSingleMaskSVE(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); +} + +/* require wide truffle compilation. The 256b mask is fully contained in the first parameter */ +static really_inline +svuint8_t blockSingleMaskWide32(svuint8_t shuf_mask_32, svuint8_t chars) { + return blockSingleMaskWideSVE(shuf_mask_32, chars); +} + +#ifdef HAVE_SVE2 +/* require wide truffle compilation. The 256b mask is split between the two parameters if the vector is 128b, + * or fully contained in the first parameter is it's 256b and more*/ +static really_inline +svuint8_t blockSingleMaskWide(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) { + return blockSingleMaskWideSVE2(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); +} +#endif //HAVE_SVE2 +#endif //HAVE_SVE + +/* require normal truffle compilation. The 256b mask is split between the two parameters */ template static really_inline const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { @@ -115,7 +243,7 @@ const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, Supe highconst.print8("highconst"); SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); shuf_mask_hi.print8("shuf_mask_hi"); - + SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(chars); shuf1.print8("shuf1"); SuperVector t1 = chars ^ highconst; @@ -131,4 +259,3 @@ const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, Supe return !res.eq(SuperVector::Zeroes()); } -#endif //HAVE_SVE \ No newline at end of file diff --git a/src/nfa/mcclellandump.cpp b/src/nfa/mcclellandump.cpp index 823010f0..071dd85b 100644 --- a/src/nfa/mcclellandump.cpp +++ b/src/nfa/mcclellandump.cpp @@ -181,6 +181,9 @@ void dumpAccelText(FILE *f, const union AccelAux *accel) { case ACCEL_TRUFFLE: fprintf(f, ":M"); break; + case ACCEL_TRUFFLE_WIDE: + fprintf(f, ":MM"); + break; default: fprintf(f, ":??"); break; @@ -200,6 +203,7 @@ void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) { case ACCEL_SHUFTI: case ACCEL_DSHUFTI: case ACCEL_TRUFFLE: + case ACCEL_TRUFFLE_WIDE: fprintf(f, "%u [ color = darkgreen style=diagonals ];\n", i); break; default: diff --git a/src/nfa/mcsheng_dump.cpp b/src/nfa/mcsheng_dump.cpp index bec4228c..352a9005 100644 --- a/src/nfa/mcsheng_dump.cpp +++ b/src/nfa/mcsheng_dump.cpp @@ -306,6 +306,7 @@ void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) { case ACCEL_SHUFTI: case ACCEL_DSHUFTI: case ACCEL_TRUFFLE: + case ACCEL_TRUFFLE_WIDE: fprintf(f, "%u [ color = darkgreen style=diagonals ];\n", i); break; default: diff --git a/src/nfa/truffle.cpp b/src/nfa/truffle.cpp index 1e783284..df3a4a3f 100644 --- a/src/nfa/truffle.cpp +++ b/src/nfa/truffle.cpp @@ -38,15 +38,48 @@ #include "util/bitutils.h" #include "truffle_simd.hpp" +#ifdef CAN_USE_WIDE_TRUFFLE +#ifdef HAVE_SVE +const u8 *truffleExecWide(m256 mask, const u8 *buf, + const u8 *buf_end) { + if (svcntb() == 16) { + return truffleExecSVE(mask, buf, buf_end); + } else { + return truffleExecSVE(mask, buf, buf_end); + } +} + +const u8 *rtruffleExecWide(m256 mask, const u8 *buf, + const u8 *buf_end) { + if (svcntb() == 16) { + return rtruffleExecSVE(mask, buf, buf_end); + } else { + return rtruffleExecSVE(mask, buf, buf_end); + } +} +#else // HAVE_SVE +#error "Wide truffle enabled for the target architecture but no implementation found" +#endif // HAVE_SVE +#endif // CAN_USE_WIDE_TRUFFLE + + #ifdef HAVE_SVE const u8 *truffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { - return truffleExecSVE(mask_lo, mask_hi, buf, buf_end); + if (svcntb() == 16) { + return truffleExecSVE({.lo = mask_lo, .hi = mask_hi}, buf, buf_end); + } else { + return truffleExecSVE({.lo = mask_lo, .hi = mask_hi}, buf, buf_end); + } } const u8 *rtruffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { - return rtruffleExecSVE(mask_lo, mask_hi, buf, buf_end); + if (svcntb() == 16) { + return rtruffleExecSVE({.lo = mask_lo, .hi = mask_hi}, buf, buf_end); + } else { + return rtruffleExecSVE({.lo = mask_lo, .hi = mask_hi}, buf, buf_end); + } } #else const u8 *truffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf, diff --git a/src/nfa/truffle.h b/src/nfa/truffle.h index f67227ad..2a587d88 100644 --- a/src/nfa/truffle.h +++ b/src/nfa/truffle.h @@ -42,6 +42,14 @@ extern "C" { #endif +#ifdef CAN_USE_WIDE_TRUFFLE +const u8 *truffleExecWide(m256 mask, const u8 *buf, + const u8 *buf_end); + +const u8 *rtruffleExecWide(m256 mask, const u8 *buf, + const u8 *buf_end); +#endif + const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end); diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index e63180d0..61609a39 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -45,6 +45,14 @@ #ifdef HAVE_SVE static really_inline svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars); + +static really_inline +svuint8_t blockSingleMaskWide32(svuint8_t shuf_mask_32, svuint8_t chars); + +#ifdef HAVE_SVE2 +static really_inline +svuint8_t blockSingleMaskWide(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars); +#endif //HAVE_SVE2 #else template static really_inline @@ -64,19 +72,36 @@ const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, Supe #endif #ifdef HAVE_SVE - -const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, +template +static really_inline +const u8 *truffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end); -const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, +template +static really_inline +const u8 *rtruffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end); +template static really_inline -const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars, const u8 *buf, bool forward) { - - const size_t vector_size_int_8 = svcntb(); - - const svuint8_t result_mask = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); +const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, + svuint8_t chars, const u8 *buf, const size_t vector_size_int_8, bool forward) +{ + svuint8_t result_mask; + if(is_wide) { + if(is_vector_128b) { +#ifdef HAVE_SVE2 + result_mask = blockSingleMaskWide(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); +#else + DEBUG_PRINTF("Wide Truffle is not supported with 128b vectors unless SVE2 is enabled"); + assert(false); +#endif + } else { + result_mask = blockSingleMaskWide32(shuf_mask_lo_highclear, chars); + } + } else { + result_mask = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + } uint64_t index; if (forward) { index = first_non_zero(vector_size_int_8, result_mask); @@ -84,25 +109,33 @@ const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_hig index = last_non_zero(vector_size_int_8, result_mask); } - if(index < vector_size_int_8) { + if (index < vector_size_int_8) { return buf+index; } else { return NULL; } } -really_inline -const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { +template +static really_inline +const u8 *truffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end) { const int vect_size_int8 = svcntb(); - // Activate only 16 lanes to read the m128 buffers - const svbool_t lane_pred_16 = svwhilelt_b8(0, 16); assert(buf && buf_end); assert(buf < buf_end); DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf); DEBUG_PRINTF("b %s\n", buf); - svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear); - svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset); + svuint8_t wide_shuf_mask_lo_highclear; + svuint8_t wide_shuf_mask_lo_highset; + if (is_wide && !is_vector_128b) { + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + wide_shuf_mask_lo_highclear = svld1(lane_pred_32, (uint8_t*) &shuf_mask_32.lo); + wide_shuf_mask_lo_highset = svld1(svpfalse(), (uint8_t*) &shuf_mask_32.hi); /* empty vector */ + } else { + const svbool_t lane_pred_16 = svwhilelt_b8(0, 16); + wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.lo); + wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.hi); + } const u8 *work_buffer = buf; const u8 *ret_val; @@ -118,16 +151,16 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, if (!ISALIGNED_N(work_buffer, vect_size_int8)) { svuint8_t chars = svld1(svptrue_b8(), work_buffer); const u8 *alligned_buffer = ROUNDUP_PTR(work_buffer, vect_size_int8); - ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true); + ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, true); if (ret_val && ret_val < alligned_buffer) return ret_val; work_buffer = alligned_buffer; } - while(work_buffer + vect_size_int8 <= buf_end) { + while (work_buffer + vect_size_int8 <= buf_end) { __builtin_prefetch(work_buffer + 16*64); DEBUG_PRINTF("work_buffer %p \n", work_buffer); svuint8_t chars = svld1(svptrue_b8(), work_buffer); - ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true); + ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, true); if (ret_val) return ret_val; work_buffer += vect_size_int8; } @@ -147,7 +180,7 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, chars = svld1(svptrue_b8(), buf_end - vect_size_int8); end_buf = buf_end - vect_size_int8; } - ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf, true); + ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf, vect_size_int8, true); DEBUG_PRINTF("ret_val %p \n", ret_val); if (ret_val && ret_val < buf_end) return ret_val; } @@ -155,18 +188,26 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, return buf_end; } -really_inline -const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){ +template +static really_inline +const u8 *rtruffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end){ const int vect_size_int8 = svcntb(); - // Activate only 16 lanes to read the m128 buffers - const svbool_t lane_pred_16 = svwhilelt_b8(0, 16); assert(buf && buf_end); assert(buf < buf_end); DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf); DEBUG_PRINTF("b %s\n", buf); - svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear); - svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset); + svuint8_t wide_shuf_mask_lo_highclear; + svuint8_t wide_shuf_mask_lo_highset; + if (is_wide && !is_vector_128b) { + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + wide_shuf_mask_lo_highclear = svld1(lane_pred_32, (uint8_t*) &shuf_mask_32.lo); + wide_shuf_mask_lo_highset = svld1(svpfalse(), (uint8_t*) &shuf_mask_32.hi); /* empty vector */ + } else { + const svbool_t lane_pred_16 = svwhilelt_b8(0, 16); + wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.lo); + wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.hi); + } const u8 *work_buffer = buf_end; const u8 *ret_val; @@ -182,7 +223,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset if (!ISALIGNED_N(work_buffer, vect_size_int8)) { svuint8_t chars = svld1(svptrue_b8(), work_buffer - vect_size_int8); const u8 *alligned_buffer = ROUNDDOWN_PTR(work_buffer, vect_size_int8); - ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer - vect_size_int8, false); + ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer - vect_size_int8, vect_size_int8, false); DEBUG_PRINTF("ret_val %p \n", ret_val); if (ret_val >= alligned_buffer) return ret_val; work_buffer = alligned_buffer; @@ -195,7 +236,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset work_buffer -= vect_size_int8; svuint8_t chars = svld1(svptrue_b8(), work_buffer); - ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, false); + ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, false); if (ret_val) return ret_val; } } @@ -211,7 +252,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset } else { chars = svld1(svptrue_b8(), buf); } - ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf, false); + ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf, vect_size_int8, false); DEBUG_PRINTF("ret_val %p \n", ret_val); if (ret_val && ret_val < buf_end) return ret_val; } @@ -253,7 +294,7 @@ const u8 *truffleExecReal(const m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_ d = dup; } - while(d + S <= buf_end) { + while (d + S <= buf_end) { __builtin_prefetch(d + 16*64); DEBUG_PRINTF("d %p \n", d); SuperVector chars = SuperVector::load(d); diff --git a/src/nfa/trufflecompile.cpp b/src/nfa/trufflecompile.cpp index f19de0ee..6c411acd 100644 --- a/src/nfa/trufflecompile.cpp +++ b/src/nfa/trufflecompile.cpp @@ -93,4 +93,33 @@ CharReach truffle2cr(const u8 *highclear, const u8 *highset) { return cr; } +void truffleBuildMasksWide(const CharReach &cr, u8 *shuf_mask) { + memset(shuf_mask, 0, 2*sizeof(m128)); + + for (size_t v = cr.find_first(); v != CharReach::npos; + v = cr.find_next(v)) { + DEBUG_PRINTF("adding 0x%02x to shuf_mask\n", (u8)v); + u8 *change_mask = shuf_mask; + u8 low_nibble = v & 0x1f; + u8 bits_567 = (v & 0xe0) >> 5; + change_mask[low_nibble] |= 1 << bits_567; + } +} + +/* + * Reconstruct the charclass that the truffle masks represent + */ +CharReach truffle2crWide(const u8 *shuf_mask) { + CharReach cr; + for (u8 i = 0; i < 32; i++) { + u32 bits_567 = shuf_mask[i]; + while (bits_567) { + u32 pos = findAndClearLSB_32(&bits_567); + assert(pos < 8); + cr.set(pos << 5 | i); + } + } + return cr; +} + } // namespc diff --git a/src/nfa/trufflecompile.h b/src/nfa/trufflecompile.h index 14b314f3..7b25c0ac 100644 --- a/src/nfa/trufflecompile.h +++ b/src/nfa/trufflecompile.h @@ -37,6 +37,12 @@ namespace ue2 { void truffleBuildMasks(const CharReach &cr, u8 *mask1, u8 *mask2); CharReach truffle2cr(const u8 *lo_in, const u8 *hi_in); +/* The wide version uses 5 bits for the Byte index instead of 4. + * It is to be used when TBL can process the whole 256b mask in one instruction + */ +void truffleBuildMasksWide(const CharReach &cr, u8 *mask); +CharReach truffle2crWide(const u8 *mask); + } #endif /* TRUFFLECOMPILE_H */ diff --git a/src/rose/rose_build_lit_accel.cpp b/src/rose/rose_build_lit_accel.cpp index 53968a3b..3ac7a304 100644 --- a/src/rose/rose_build_lit_accel.cpp +++ b/src/rose/rose_build_lit_accel.cpp @@ -461,11 +461,20 @@ void findForwardAccelScheme(const vector &lits, aux->shufti.offset = verify_u8(min_offset); return; } - - truffleBuildMasks(cr, reinterpret_cast(&aux->truffle.mask1), reinterpret_cast(&aux->truffle.mask2)); +#if defined(CAN_USE_WIDE_TRUFFLE) + if(CAN_USE_WIDE_TRUFFLE) { + aux->truffle.accel_type = ACCEL_TRUFFLE_WIDE; + truffleBuildMasksWide(cr, reinterpret_cast(&aux->truffle.mask)); + } else +#endif + { + aux->truffle.accel_type = ACCEL_TRUFFLE; + truffleBuildMasks(cr, + reinterpret_cast(&aux->truffle.mask_lo), + reinterpret_cast(&aux->truffle.mask_hi)); + } DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n", describeClass(cr).c_str(), cr.count(), min_offset); - aux->truffle.accel_type = ACCEL_TRUFFLE; aux->truffle.offset = verify_u8(min_offset); } diff --git a/src/util/arch/arm/arm.h b/src/util/arch/arm/arm.h index c38ac697..3735d066 100644 --- a/src/util/arch/arm/arm.h +++ b/src/util/arch/arm/arm.h @@ -53,5 +53,11 @@ #define HAVE_SVE2_BITPERM #endif +#if defined(HAVE_SVE2) +#define CAN_USE_WIDE_TRUFFLE 1 +#elif defined(HAVE_SVE) +#define CAN_USE_WIDE_TRUFFLE (svcntb() >= 32) +#endif + #endif // UTIL_ARCH_ARM_H_ diff --git a/src/util/arch/arm/simd_types.h b/src/util/arch/arm/simd_types.h index 7dafcf58..c47bfb69 100644 --- a/src/util/arch/arm/simd_types.h +++ b/src/util/arch/arm/simd_types.h @@ -34,5 +34,9 @@ typedef int32x4_t m128; #endif +#if !defined(m256) && defined(m128) +typedef struct {m128 lo; m128 hi;} m256; +#endif + #endif /* SIMD_TYPES_ARM_H */ diff --git a/src/util/supervector/arch/arm/types.hpp b/src/util/supervector/arch/arm/types.hpp index 6e362e1c..718532d7 100644 --- a/src/util/supervector/arch/arm/types.hpp +++ b/src/util/supervector/arch/arm/types.hpp @@ -31,3 +31,6 @@ typedef int32x4_t m128; #endif +#if !defined(m256) && defined(m128) +typedef struct {m128 lo; m128 hi;} m256; +#endif diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index e2196459..18d2fe79 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -107,6 +107,7 @@ set(unit_internal_SOURCES internal/shufti.cpp internal/state_compress.cpp internal/truffle.cpp + internal/truffleWide.cpp internal/unaligned.cpp internal/unicode_set.cpp internal/uniform_ops.cpp diff --git a/unit/internal/sheng.cpp b/unit/internal/sheng.cpp index e8e45ac5..342757d4 100644 --- a/unit/internal/sheng.cpp +++ b/unit/internal/sheng.cpp @@ -290,19 +290,26 @@ struct NFA *get_expected_nfa_header(u8 type, unsigned int length, unsigned int n } struct NFA *get_expected_nfa16_header() { - return get_expected_nfa_header(SHENG_NFA, 4736, 8); + return get_expected_nfa_header(SHENG_NFA, 4736, 8); /* size recorded in 04/2024 */ } #if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) struct NFA *get_expected_nfa32_header() { - return get_expected_nfa_header(SHENG_NFA_32, 17216, 18); + return get_expected_nfa_header(SHENG_NFA_32, 17216, 18); /* size recorded in 04/2024 */ } #endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */ void test_nfa_equal(const NFA& l, const NFA& r) { + /** + * The length is meant to be a sanity test: it's not 0 (we compiled something) and that it roughly fit the + * expected size for a given sheng implementation (we don't feed compiled sheng32 into sheng16). + * Changes in other nfa algorithms may affect the sheng length, so we accept small variations. + */ + int relative_difference = std::abs((float)(l.length) - r.length) / ((l.length + r.length) / 2); + EXPECT_LE(relative_difference, 0.1); /* same +-10% */ + EXPECT_EQ(l.flags, r.flags); - EXPECT_EQ(l.length, r.length); EXPECT_EQ(l.type, r.type); EXPECT_EQ(l.rAccelType, r.rAccelType); EXPECT_EQ(l.rAccelOffset, r.rAccelOffset); diff --git a/unit/internal/truffleWide.cpp b/unit/internal/truffleWide.cpp new file mode 100644 index 00000000..733e6ffb --- /dev/null +++ b/unit/internal/truffleWide.cpp @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2024, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "gtest/gtest.h" +#include "nfa/truffle.h" +#include "nfa/trufflecompile.h" +#include "util/charreach.h" +#include "util/simd_utils.h" + +#include "util/arch.h" +#ifdef HAVE_SVE +using namespace ue2; + +#define SKIP_IF_NO_WIDE_AVAILABLE() \ + if(!CAN_USE_WIDE_TRUFFLE) {\ + std::cout << "[ SKIPPED ] System does not support wide truffle" << std::endl;\ + return;\ + } + +TEST(TruffleWide, CompileDot) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + memset(&mask, 0, sizeof(mask)); + + CharReach chars; + + chars.setall(); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + CharReach out = truffle2crWide((u8 *)&mask); + + ASSERT_EQ(out, chars); + +} + +TEST(TruffleWide, CompileChars) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + // test one char at a time + for (u32 c = 0; c < 256; ++c) { + mask = zeroes256(); + chars.clear(); + chars.set((u8)c); + truffleBuildMasksWide(chars, (u8 *)&mask); + CharReach out = truffle2crWide((u8 *)&mask); + ASSERT_EQ(out, chars); + } + + // set all chars up to dot + for (u32 c = 0; c < 256; ++c) { + mask = zeroes256(); + chars.set((u8)c); + truffleBuildMasksWide(chars, (u8 *)&mask); + CharReach out = truffle2crWide((u8 *)&mask); + ASSERT_EQ(out, chars); + } + + // unset all chars from dot + for (u32 c = 0; c < 256; ++c) { + mask = zeroes256(); + chars.clear((u8)c); + truffleBuildMasksWide(chars, (u8 *)&mask); + CharReach out = truffle2crWide((u8 *)&mask); + ASSERT_EQ(out, chars); + } + +} + +TEST(TruffleWide, ExecNoMatch1) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + memset(&mask, 0, sizeof(mask)); + + CharReach chars; + + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\xff"; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + strlen(t1), (size_t)rv); + } +} + +TEST(TruffleWide, ExecNoMatch2) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('a'); + chars.set('B'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + strlen(t1), (size_t)rv); + } +} + +TEST(TruffleWide, ExecNoMatch3) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('V'); /* V = 0x56, e = 0x65 */ + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + strlen(t1), (size_t)rv); + } +} + +TEST(TruffleWide, ExecMiniMatch0) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "a"; + + const u8 *rv = truffleExecWide(mask, (u8 *)t1, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1, (size_t)rv); +} + +TEST(TruffleWide, ExecMiniMatch1) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "bbbbbbbabbb"; + + const u8 *rv = truffleExecWide(mask, (u8 *)t1, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + 7, (size_t)rv); +} + +TEST(TruffleWide, ExecMiniMatch2) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set(0); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "bbbbbbb\0bbb"; + + const u8 *rv = truffleExecWide(mask, (u8 *)t1, (u8 *)t1 + 11); + + ASSERT_EQ((size_t)t1 + 7, (size_t)rv); +} + +TEST(TruffleWide, ExecMiniMatch3) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "\0\0\0\0\0\0\0a\0\0\0"; + + const u8 *rv = truffleExecWide(mask, (u8 *)t1, (u8 *)t1 + 11); + + ASSERT_EQ((size_t)t1 + 7, (size_t)rv); +} + +TEST(TruffleWide, ExecMatchBig) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + std::array t1; + t1.fill('b'); + t1[120] = 'a'; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1.data() + i, (u8 *)t1.data() + 399); + + ASSERT_LE(((size_t)t1.data() + 120) & ~0xf, (size_t)rv); + } +} + +TEST(TruffleWide, ExecMatch1) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbb"; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + 17, (size_t)rv); + } +} + +TEST(TruffleWide, ExecMatch2) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + 17, (size_t)rv); + } +} + +TEST(TruffleWide, ExecMatch3) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('a'); + chars.set('B'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbbbbbbbbbbbbBaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + 17, (size_t)rv); + } +} + +TEST(TruffleWide, ExecMatch4) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('a'); + chars.set('C'); + chars.set('A'); + chars.set('c'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbbbbbbbbbbbbAaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; + char t2[] = "bbbbbbbbbbbbbbbbbCaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; + char t3[] = "bbbbbbbbbbbbbbbbbcaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; + char t4[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = truffleExecWide(mask, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1 + 17, (size_t)rv); + + rv = truffleExecWide(mask, (u8 *)t2 + i, (u8 *)t2 + strlen(t1)); + + ASSERT_EQ((size_t)t2 + 17, (size_t)rv); + + rv = truffleExecWide(mask, (u8 *)t3 + i, (u8 *)t3 + strlen(t3)); + + ASSERT_EQ((size_t)t3 + 17, (size_t)rv); + + rv = truffleExecWide(mask, (u8 *)t4 + i, (u8 *)t4 + strlen(t4)); + + ASSERT_EQ((size_t)t4 + 17, (size_t)rv); + } +} + +TEST(TruffleWide, ExecMatch5) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + + for (size_t i = 0; i < 31; i++) { + t1[48 - i] = 'a'; + const u8 *rv = truffleExecWide(mask, (u8 *)t1, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)&t1[48 - i], (size_t)rv); + } +} + +TEST(TruffleWide, ExecMatch6) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + // [0-Z] - includes some graph chars + chars.setRange('0', 'Z'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + std::array t1; + t1.fill('*'); // it's full of stars! + + for (u8 c = '0'; c <= 'Z'; c++) { + t1[17] = c; + const u8 *rv = truffleExecWide(mask, (u8 *)t1.data(), (u8 *)t1.data() + 128); + + ASSERT_EQ((size_t)t1.data() + 17, (size_t)rv); + } +} + +TEST(TruffleWide, ExecMatch7) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + // hi bits + chars.setRange(127, 255); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + std::array t1; + t1.fill('*'); // it's full of stars! + + for (unsigned int c = 127; c <= 255; c++) { + t1[40] = (u8)c; + const u8 *rv = truffleExecWide(mask, (u8 *)t1.data(), (u8 *)t1.data() + 128); + + ASSERT_EQ((size_t)t1.data() + 40, (size_t)rv); + } +} + +TEST(ReverseTruffleWide, ExecNoMatch1) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t[] = " bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char *t1 = t + 1; + size_t len = strlen(t1); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + ASSERT_EQ((const u8 *)t, rv); + } +} + +TEST(ReverseTruffleWide, ExecNoMatch2) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + + chars.set('a'); + chars.set('B'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t[] = " bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char *t1 = t + 1; + size_t len = strlen(t1); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + ASSERT_EQ((const u8 *)t, rv); + } +} + +TEST(ReverseTruffleWide, ExecNoMatch3) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('V'); /* V = 0x56, e = 0x65 */ + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; + char *t1 = t + 1; + size_t len = strlen(t1); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + ASSERT_EQ((const u8 *)t, rv); + } +} + +TEST(ReverseTruffleWide, ExecMiniMatch0) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "a"; + + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + strlen(t1)); + + ASSERT_EQ((size_t)t1, (size_t)rv); +} + +TEST(ReverseTruffleWide, ExecMiniMatch1) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbbabbbb"; + size_t len = strlen(t1); + + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len); + ASSERT_NE((const u8 *)t1 - 1, rv); // not found + EXPECT_EQ('a', (char)*rv); + ASSERT_EQ((const u8 *)t1 + 7, rv); +} + +TEST(ReverseTruffleWide, ExecMiniMatch2) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "babbbbbabbbb"; + size_t len = strlen(t1); + + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len); + ASSERT_NE((const u8 *)t1 - 1, rv); // not found + EXPECT_EQ('a', (char)*rv); + ASSERT_EQ((const u8 *)t1 + 7, rv); +} + + +TEST(ReverseTruffleWide, ExecMatch1) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbabbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + size_t len = strlen(t1); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + ASSERT_NE((const u8 *)t1 - 1, rv); // not found + EXPECT_EQ('a', (char)*rv); + ASSERT_EQ((const u8 *)t1 + 17, rv); + } +} + +TEST(ReverseTruffleWide, ExecMatch2) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbabbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + size_t len = strlen(t1); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + ASSERT_NE((const u8 *)t1 - 1, rv); // not found + EXPECT_EQ('a', (char)*rv); + ASSERT_EQ((const u8 *)t1 + 32, rv); + } +} + +TEST(ReverseTruffleWide, ExecMatch3) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + chars.set('B'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaBbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + size_t len = strlen(t1); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + ASSERT_NE((const u8 *)t1 - 1, rv); // not found + EXPECT_EQ('B', (char)*rv); + ASSERT_EQ((const u8 *)t1 + 32, rv); + } + + // check that we match the 'a' bytes as well. + ASSERT_EQ('B', t1[32]); + t1[32] = 'b'; + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + ASSERT_NE((const u8 *)t1 - 1, rv); // not found + EXPECT_EQ('a', (char)*rv); + ASSERT_EQ((const u8 *)t1 + 31, rv); + } +} + +TEST(ReverseTruffleWide, ExecMatch4) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + chars.set('C'); + chars.set('A'); + chars.set('c'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + /* 0123456789012345678901234567890 */ + char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaAbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char t2[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaCbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char t3[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaacbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + char t4[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + size_t len = strlen(t1); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len - i); + EXPECT_EQ('A', (char)*rv); + ASSERT_EQ((const u8 *)t1 + 32, rv); + + rv = rtruffleExecWide(mask, (u8 *)t2, (u8 *)t2 + len - i); + EXPECT_EQ('C', (char)*rv); + ASSERT_EQ((const u8 *)t2 + 32, rv); + + rv = rtruffleExecWide(mask, (u8 *)t3, (u8 *)t3 + len - i); + EXPECT_EQ('c', (char)*rv); + ASSERT_EQ((const u8 *)t3 + 32, rv); + + rv = rtruffleExecWide(mask, (u8 *)t4, (u8 *)t4 + len - i); + EXPECT_EQ('a', (char)*rv); + ASSERT_EQ((const u8 *)t4 + 32, rv); + } +} + +TEST(ReverseTruffleWide, ExecMatch5) { + SKIP_IF_NO_WIDE_AVAILABLE() + m256 mask; + + CharReach chars; + chars.set('a'); + + truffleBuildMasksWide(chars, (u8 *)&mask); + + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + size_t len = strlen(t1); + + for (size_t i = 0; i < len; i++) { + t1[i] = 'a'; + const u8 *rv = rtruffleExecWide(mask, (u8 *)t1, (u8 *)t1 + len); + + ASSERT_EQ((const u8 *)t1 + i, rv); + } +} +#endif