Implement new DoubleVermicelli16 acceleration functions using SVE2

Change-Id: Id4a8ffca840caab930a6e78cc0dfd0fe7d320b4e
This commit is contained in:
George Wort
2021-06-28 16:29:43 +01:00
committed by Konstantinos Margaritis
parent 25183089fd
commit 6c6aee9682
9 changed files with 874 additions and 159 deletions

View File

@@ -91,6 +91,28 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
rv = vermicelli16Exec(accel->verm16.mask, c, c_end);
break;
case ACCEL_DVERM16:
DEBUG_PRINTF("accel dverm16 %p %p\n", c, c_end);
if (c_end - c < 18) {
return c;
}
/* need to stop one early to get an accurate end state */
rv = vermicelliDouble16Exec(accel->dverm16.mask, accel->dverm16.firsts,
c, c_end - 1);
break;
case ACCEL_DVERM16_MASKED:
DEBUG_PRINTF("accel dverm16 masked %p %p\n", c, c_end);
if (c_end - c < 18) {
return c;
}
/* need to stop one early to get an accurate end state */
rv = vermicelliDoubleMasked16Exec(accel->mdverm16.mask, accel->mdverm16.c1,
accel->mdverm16.m1, c, c_end - 1);
break;
#endif // HAVE_SVE2
case ACCEL_DVERM_MASKED:

View File

@@ -63,7 +63,9 @@ enum AccelType {
ACCEL_TRUFFLE,
ACCEL_RED_TAPE,
ACCEL_DVERM_MASKED,
ACCEL_VERM16
ACCEL_VERM16,
ACCEL_DVERM16,
ACCEL_DVERM16_MASKED,
};
/** \brief Structure for accel framework. */
@@ -104,6 +106,19 @@ union AccelAux {
u8 offset;
m128 mask;
} verm16;
struct {
u8 accel_type;
u8 offset;
u64a firsts;
m128 mask;
} dverm16;
struct {
u8 accel_type;
u8 offset;
u8 c1; // used for partial match
u8 m1; // used for partial match
m128 mask;
} mdverm16;
struct {
u8 accel_type;
u8 offset;

View File

@@ -442,45 +442,75 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
return;
}
if (double_byte_ok(info) && info.double_cr.none() &&
(info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
bool ok = true;
if (double_byte_ok(info) && info.double_cr.none()) {
if ((info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
bool ok = true;
assert(!info.double_byte.empty());
u8 firstC = info.double_byte.begin()->first & CASE_CLEAR;
u8 secondC = info.double_byte.begin()->second & CASE_CLEAR;
assert(!info.double_byte.empty());
u8 firstC = info.double_byte.begin()->first & CASE_CLEAR;
u8 secondC = info.double_byte.begin()->second & CASE_CLEAR;
for (const pair<u8, u8> &p : info.double_byte) {
if ((p.first & CASE_CLEAR) != firstC ||
(p.second & CASE_CLEAR) != secondC) {
ok = false;
break;
for (const pair<u8, u8> &p : info.double_byte) {
if ((p.first & CASE_CLEAR) != firstC ||
(p.second & CASE_CLEAR) != secondC) {
ok = false;
break;
}
}
if (ok) {
accel->accel_type = ACCEL_DVERM_NOCASE;
accel->dverm.c1 = firstC;
accel->dverm.c2 = secondC;
accel->dverm.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx);
return;
}
u8 m1;
u8 m2;
if (buildDvermMask(info.double_byte, &m1, &m2)) {
u8 c1 = info.double_byte.begin()->first & m1;
u8 c2 = info.double_byte.begin()->second & m2;
#ifdef HAVE_SVE2
if (vermicelliDoubleMasked16Build(c1, c2, m1, m2, (u8 *)&accel->mdverm16.mask)) {
accel->accel_type = ACCEL_DVERM16_MASKED;
accel->mdverm16.offset = verify_u8(info.double_offset);
accel->mdverm16.c1 = c1;
accel->mdverm16.m1 = m1;
DEBUG_PRINTF("building maskeddouble16-vermicelli for 0x%02hhx%02hhx\n",
c1, c2);
return;
} else if (info.double_byte.size() <= 8 &&
vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask,
(u8 *)&accel->dverm16.firsts)) {
accel->accel_type = ACCEL_DVERM16;
accel->dverm16.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("building double16-vermicelli\n");
return;
}
#endif // HAVE_SVE2
accel->accel_type = ACCEL_DVERM_MASKED;
accel->dverm.offset = verify_u8(info.double_offset);
accel->dverm.c1 = c1;
accel->dverm.c2 = c2;
accel->dverm.m1 = m1;
accel->dverm.m2 = m2;
DEBUG_PRINTF(
"building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", c1, c2);
return;
}
}
if (ok) {
accel->accel_type = ACCEL_DVERM_NOCASE;
accel->dverm.c1 = firstC;
accel->dverm.c2 = secondC;
accel->dverm.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx);
return;
}
u8 m1;
u8 m2;
if (buildDvermMask(info.double_byte, &m1, &m2)) {
accel->accel_type = ACCEL_DVERM_MASKED;
accel->dverm.offset = verify_u8(info.double_offset);
accel->dverm.c1 = info.double_byte.begin()->first & m1;
accel->dverm.c2 = info.double_byte.begin()->second & m2;
accel->dverm.m1 = m1;
accel->dverm.m2 = m2;
DEBUG_PRINTF(
"building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
accel->dverm.c1, accel->dverm.c2);
#ifdef HAVE_SVE2
if (info.double_byte.size() <= 8 &&
vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask,
(u8 *)&accel->dverm16.firsts)) {
accel->accel_type = ACCEL_DVERM16;
accel->dverm16.offset = verify_u8(info.double_offset);
DEBUG_PRINTF("building double16-vermicelli\n");
return;
}
#endif // HAVE_SVE2
}
if (double_byte_ok(info) &&

View File

@@ -207,16 +207,45 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
u8 m2;
if (buildDvermMask(info.double_stop2, &m1, &m2)) {
u8 c1 = info.double_stop2.begin()->first & m1;
u8 c2 = info.double_stop2.begin()->second & m2;
#ifdef HAVE_SVE2
if (vermicelliDoubleMasked16Build(c1, c2, m1, m2, (u8 *)&aux->mdverm16.mask)) {
aux->accel_type = ACCEL_DVERM16_MASKED;
aux->mdverm16.offset = offset;
aux->mdverm16.c1 = c1;
aux->mdverm16.m1 = m1;
DEBUG_PRINTF("building maskeddouble16-vermicelli for 0x%02hhx%02hhx\n",
c1, c2);
return;
} else if (outs2 <= 8 &&
vermicelliDouble16Build(info.double_stop2, (u8 *)&aux->dverm16.mask,
(u8 *)&aux->dverm16.firsts)) {
aux->accel_type = ACCEL_DVERM16;
aux->dverm16.offset = offset;
DEBUG_PRINTF("building double16-vermicelli\n");
return;
}
#endif // HAVE_SVE2
aux->accel_type = ACCEL_DVERM_MASKED;
aux->dverm.offset = offset;
aux->dverm.c1 = info.double_stop2.begin()->first & m1;
aux->dverm.c2 = info.double_stop2.begin()->second & m2;
aux->dverm.c1 = c1;
aux->dverm.c2 = c2;
aux->dverm.m1 = m1;
aux->dverm.m2 = m2;
DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
aux->dverm.c1, aux->dverm.c2);
DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", c1, c2);
return;
}
#ifdef HAVE_SVE2
if (outs2 <= 8 &&
vermicelliDouble16Build(info.double_stop2, (u8 *)&aux->dverm16.mask,
(u8 *)&aux->dverm16.firsts)) {
aux->accel_type = ACCEL_DVERM16;
aux->dverm16.offset = offset;
DEBUG_PRINTF("building double16-vermicelli\n");
return;
}
#endif // HAVE_SVE2
}
if (outs1 < outs2 && outs1 <= 2) { // Heuristic from UE-438.

View File

@@ -267,9 +267,7 @@ const u8 *rvermSearch(svuint8_t chars, const u8 *buf, const u8 *buf_end,
}
static really_inline
const u8 *dvermSearch(char c1, char c2, bool nocase, const u8 *buf,
const u8 *buf_end) {
svuint16_t chars = getCharMaskDouble(c1, c2, nocase);
const u8 *dvermSearch(svuint8_t chars, const u8 *buf, const u8 *buf_end) {
size_t len = buf_end - buf;
if (len <= svcntb()) {
return dvermSearchOnce(chars, buf, buf_end);
@@ -374,7 +372,8 @@ const u8 *vermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf,
assert(buf < buf_end);
if (buf_end - buf > 1) {
++buf;
const u8 *ptr = dvermSearch(c1, c2, nocase, buf, buf_end);
svuint16_t chars = getCharMaskDouble(c1, c2, nocase);
const u8 *ptr = dvermSearch(chars, buf, buf_end);
if (ptr) {
return ptr;
}
@@ -406,42 +405,92 @@ const u8 *rvermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf,
}
static really_inline
svuint8_t getDupSVEMaskFrom128(m128 _mask) {
return svld1rq_u8(svptrue_b8(), (const uint8_t *)&_mask);
svuint8_t getDupSVEMaskFrom128(m128 mask) {
return svld1rq_u8(svptrue_b8(), (const uint8_t *)&mask);
}
static really_inline
const u8 *vermicelli16Exec(const m128 _chars, const u8 *buf,
const u8 *vermicelli16Exec(const m128 mask, const u8 *buf,
const u8 *buf_end) {
DEBUG_PRINTF("verm16 scan over %td bytes\n", buf_end - buf);
svuint8_t chars = getDupSVEMaskFrom128(_chars);
svuint8_t chars = getDupSVEMaskFrom128(mask);
const u8 *ptr = vermSearch(chars, buf, buf_end, false);
return ptr ? ptr : buf_end;
}
static really_inline
const u8 *nvermicelli16Exec(const m128 _chars, const u8 *buf,
const u8 *nvermicelli16Exec(const m128 mask, const u8 *buf,
const u8 *buf_end) {
DEBUG_PRINTF("nverm16 scan over %td bytes\n", buf_end - buf);
svuint8_t chars = getDupSVEMaskFrom128(_chars);
svuint8_t chars = getDupSVEMaskFrom128(mask);
const u8 *ptr = vermSearch(chars, buf, buf_end, true);
return ptr ? ptr : buf_end;
}
static really_inline
const u8 *rvermicelli16Exec(const m128 _chars, const u8 *buf,
const u8 *rvermicelli16Exec(const m128 mask, const u8 *buf,
const u8 *buf_end) {
DEBUG_PRINTF("rverm16 scan over %td bytes\n", buf_end - buf);
svuint8_t chars = getDupSVEMaskFrom128(_chars);
svuint8_t chars = getDupSVEMaskFrom128(mask);
const u8 *ptr = rvermSearch(chars, buf, buf_end, false);
return ptr ? ptr : buf - 1;
}
static really_inline
const u8 *rnvermicelli16Exec(const m128 _chars, const u8 *buf,
const u8 *rnvermicelli16Exec(const m128 mask, const u8 *buf,
const u8 *buf_end) {
DEBUG_PRINTF("rnverm16 scan over %td bytes\n", buf_end - buf);
svuint8_t chars = getDupSVEMaskFrom128(_chars);
svuint8_t chars = getDupSVEMaskFrom128(mask);
const u8 *ptr = rvermSearch(chars, buf, buf_end, true);
return ptr ? ptr : buf - 1;
}
}
/*
 * Report whether the final byte of the buffer (buf_end[-1]) equals any of the
 * bytes packed into first_chars.
 */
static really_inline
bool vermicelliDouble16CheckPartial(const u64a first_chars, const u8 *buf_end) {
    const svbool_t all_lanes = svptrue_b8();
    /* Broadcast the packed bytes, then view the vector as byte lanes. */
    const svuint8_t candidates = svreinterpret_u8(svdup_u64(first_chars));
    const svuint8_t last_byte = svdup_u8(buf_end[-1]);
    return svptest_any(all_lanes, svcmpeq(all_lanes, candidates, last_byte));
}
/*
 * Double-byte scan driven by a 16-byte table (mask) that is replicated across
 * the SVE vector and handed to dvermSearch().
 *
 * Returns whatever dvermSearch() finds when it reports a hit. Otherwise
 * returns buf_end - 1 if the last byte equals one of the bytes packed in
 * firsts (a possible first half of a pair), or buf_end if it does not.
 */
static really_inline
const u8 *vermicelliDouble16Exec(const m128 mask, const u64a firsts,
                                 const u8 *buf, const u8 *buf_end) {
    assert(buf < buf_end);
    DEBUG_PRINTF("double verm16 scan over %td bytes\n", buf_end - buf);

    /* A single byte cannot hold a pair; go straight to the partial check. */
    if (buf_end - buf > 1) {
        const svuint16_t pairs = svreinterpret_u16(getDupSVEMaskFrom128(mask));
        /* The search starts one byte in, as the original ++buf did. */
        const u8 *hit = dvermSearch(pairs, buf + 1, buf_end);
        if (hit) {
            return hit;
        }
    }

    /* check for partial match at end */
    if (!vermicelliDouble16CheckPartial(firsts, buf_end)) {
        return buf_end;
    }
    DEBUG_PRINTF("partial!!!\n");
    return buf_end - 1;
}
/*
 * Masked double-byte scan driven by a 16-byte table (mask) that is replicated
 * across the SVE vector and handed to dvermSearch().
 *
 * Returns whatever dvermSearch() finds when it reports a hit. Otherwise
 * returns buf_end - 1 if the last byte, ANDed with m1, equals c1 (a possible
 * first half of a pair), or buf_end if it does not.
 */
static really_inline
const u8 *vermicelliDoubleMasked16Exec(const m128 mask, char c1, char m1,
                                       const u8 *buf, const u8 *buf_end) {
    assert(buf < buf_end);
    DEBUG_PRINTF("double verm16 masked scan over %td bytes\n", buf_end - buf);

    /* A single byte cannot hold a pair; go straight to the partial check. */
    if (buf_end - buf > 1) {
        const svuint16_t pairs = svreinterpret_u16(getDupSVEMaskFrom128(mask));
        /* The search starts one byte in, as the original ++buf did. */
        const u8 *hit = dvermSearch(pairs, buf + 1, buf_end);
        if (hit) {
            return hit;
        }
    }

    /* check for partial match at end */
    if ((buf_end[-1] & m1) != (u8)c1) {
        return buf_end;
    }
    DEBUG_PRINTF("partial!!!\n");
    return buf_end - 1;
}

View File

@@ -50,4 +50,207 @@ bool vermicelli16Build(const CharReach &chars, u8 *rv) {
return true;
}
/**
 * Build the tables used by the 16-byte double vermicelli scan.
 *
 * \param twochar Set of (first, second) byte pairs to search for.
 * \param chars   Output, 16 bytes: up to eight pairs laid out as
 *                first, second, first, second, ...
 * \param firsts  Output, 8 bytes: the first byte of each pair.
 * \return true if the tables were written; false if \p twochar is empty or
 *         holds more than eight pairs, in which case nothing useful has been
 *         written and the caller must pick another scheme.
 */
bool vermicelliDouble16Build(const flat_set<std::pair<u8, u8>> &twochar,
                             u8 *chars, u8 *firsts) {
    constexpr size_t count_limit = 8;
    // An empty set must be rejected: the padding loop below copies slot 0,
    // which would otherwise never have been written.
    if (twochar.empty() || twochar.size() > count_limit) {
        return false;
    }

    size_t count = 0;
    for (const auto &p : twochar) {
        firsts[count] = p.first;
        chars[2 * count] = p.first;
        chars[(2 * count) + 1] = p.second;
        ++count;
    }

    // Fill the unused slots with copies of the first pair so that every slot
    // holds a pair from the set.
    for (; count < count_limit; ++count) {
        firsts[count] = chars[0];
        chars[2 * count] = chars[0];
        chars[(2 * count) + 1] = chars[1];
    }
    return true;
}
/**
 * Fill the 16-byte output \p rv by repeating the first \p len bytes of
 * \p matches cyclically. \p len must be non-zero.
 */
static really_inline
void fillMask(u8 matches[], size_t len, u8 *rv) {
    size_t src = 0;
    for (size_t dst = 0; dst < 16; ++dst) {
        rv[dst] = matches[src];
        // Wrap back to the start of the pattern once it has been consumed.
        if (++src == len) {
            src = 0;
        }
    }
}
/**
 * Expand \p c into its two variants over bit position \p bit:
 * cases[0] has the bit cleared, cases[1] has it set.
 */
static really_inline
void getTwoCases(u8 cases[2], u8 bit, char c) {
    const u8 bit_mask = 1UL << bit;
    const u8 cleared = c & ~bit_mask;
    cases[0] = cleared;
    cases[1] = cleared | bit_mask;
}
/**
 * Expand two input bytes over bit position \p bit. For each input, in order,
 * the variant with the bit cleared is written first, then the variant with
 * the bit set, giving four output bytes.
 */
static really_inline
void getFourCases(u8 cases[4], u8 bit, char case1, char case2) {
    const u8 bit_mask = 1UL << bit;
    // The inputs arrive by value, so the output may overlap their source.
    const char inputs[2] = { case1, case2 };
    for (int i = 0; i < 2; ++i) {
        cases[2 * i] = inputs[i] & (~bit_mask);
        cases[2 * i + 1] = inputs[i] | bit_mask;
    }
}
/**
 * Expand four input bytes over bit position \p bit. For each input, in order,
 * the variant with the bit cleared is written first, then the variant with
 * the bit set, giving eight output bytes.
 */
static really_inline
void getEightCases(u8 cases[8], u8 bit, char case1, char case2,
                   char case3, char case4) {
    const u8 bit_mask = 1UL << bit;
    // The inputs arrive by value, so the output may overlap their source.
    const char inputs[4] = { case1, case2, case3, case4 };
    for (int i = 0; i < 4; ++i) {
        cases[2 * i] = inputs[i] & (~bit_mask);
        cases[2 * i + 1] = inputs[i] | bit_mask;
    }
}
// Write into rv a 16-byte table of (first, second) byte pairs: one pair for
// every combination of clear/set values of the "hole" bits of c1 and c2.
//
// holes[] lists bit positions; the first c1_holes entries are applied to c1
// and the following c2_holes entries to c2. The branches below handle every
// combination with c1_holes + c2_holes <= 3 (at most eight pairs). Tables
// with fewer than eight pairs are repeated by fillMask() to fill 16 bytes.
//
// Returns true once rv has been written, false for an unsupported combination
// (the inner switches also assert in that case).
static really_inline
bool getDoubleMatchesForBits(u8 c1, u8 c2, u8 holes[3], u8 c1_holes,
u8 c2_holes, u8 *rv) {
// Scratch space for the expanded variants. It is reused in place: the
// get*Cases() helpers take their inputs by value, so a call may overwrite
// the slots its arguments were read from.
u8 cases[8];
switch (c1_holes) {
case 0:
// c1 is fixed; only c2 is expanded.
switch (c2_holes) {
case 0: {
// Single exact pair.
u8 matches[2] = { c1, c2 };
fillMask(matches, 2, rv);
return true;
}
case 1: {
getTwoCases(cases, holes[0], c2);
u8 matches[4] = { c1, cases[0], c1, cases[1] };
fillMask(matches, 4, rv);
return true;
}
case 2: {
// cases[0..1]: c2 over the first hole; cases[2..5]: over both holes.
getTwoCases(cases, holes[0], c2);
getFourCases(&cases[2], holes[1], cases[0], cases[1]);
u8 matches[8] = { c1, cases[2], c1, cases[3],
c1, cases[4], c1, cases[5] };
fillMask(matches, 8, rv);
return true;
}
case 3: {
// Intermediate results are staged in cases[4..7] so that the final
// eight variants can be written to cases[0..7].
getTwoCases(cases, holes[0], c2);
getFourCases(&cases[4], holes[1], cases[0], cases[1]);
getEightCases(cases, holes[2], cases[4], cases[5],
cases[6], cases[7]);
u8 matches[16] = { c1, cases[0], c1, cases[1],
c1, cases[2], c1, cases[3],
c1, cases[4], c1, cases[5],
c1, cases[6], c1, cases[7] };
memcpy(rv, matches, sizeof(matches));
return true;
}
default:
assert(c2_holes < 4);
break;
}
break;
case 1:
// cases[0..1]: the two variants of c1.
getTwoCases(cases, holes[0], c1);
switch (c2_holes) {
case 0: {
u8 matches[4] = { cases[0] , c2, cases[1], c2 };
fillMask(matches, 4, rv);
return true;
}
case 1: {
// cases[2..3]: the two variants of c2; emit all four combinations.
getTwoCases(&cases[2], holes[1], c2);
u8 matches[8] = { cases[0], cases[2],
cases[0], cases[3],
cases[1], cases[2],
cases[1], cases[3] };
fillMask(matches, 8, rv);
return true;
}
case 2: {
// cases[4..7]: the four variants of c2; emit all eight combinations.
getTwoCases(&cases[2], holes[1], c2);
getFourCases(&cases[4], holes[2], cases[2], cases[3]);
u8 matches[16] = { cases[0], cases[4], cases[0], cases[5],
cases[0], cases[6], cases[0], cases[7],
cases[1], cases[4], cases[1], cases[5],
cases[1], cases[6], cases[1], cases[7] };
memcpy(rv, matches, sizeof(matches));
return true;
}
default:
assert(c2_holes < 3);
break;
}
break;
case 2:
// cases[2..5]: the four variants of c1.
getTwoCases(cases, holes[0], c1);
getFourCases(&cases[2], holes[1], cases[0], cases[1]);
switch (c2_holes) {
case 0: {
u8 matches[8] = { cases[2], c2, cases[3], c2,
cases[4], c2, cases[5], c2 };
fillMask(matches, 8, rv);
return true;
}
case 1: {
// cases[6..7]: the two variants of c2; emit all eight combinations.
getTwoCases(&cases[6], holes[2], c2);
u8 matches[16] = { cases[2], cases[6], cases[3], cases[6],
cases[4], cases[6], cases[5], cases[6],
cases[2], cases[7], cases[3], cases[7],
cases[4], cases[7], cases[5], cases[7] };
memcpy(rv, matches, sizeof(matches));
return true;
}
default:
assert(c2_holes < 2);
break;
}
break;
case 3: {
// All three holes belong to c1; c2 must be exact.
assert(!c2_holes);
getTwoCases(cases, holes[0], c1);
getFourCases(&cases[4], holes[1], cases[0], cases[1]);
getEightCases(cases, holes[2], cases[4], cases[5],
cases[6], cases[7]);
u8 matches[16] = { cases[0], c2, cases[1], c2,
cases[2], c2, cases[3], c2,
cases[4], c2, cases[5], c2,
cases[6], c2, cases[7], c2 };
memcpy(rv, matches, sizeof(matches));
return true;
}
}
// Unsupported hole combination.
return false;
}
/**
 * Collect the bit positions that are clear in \p m1 and then in \p m2 (the
 * "holes"), and hand them to getDoubleMatchesForBits() to build the 16-byte
 * pair table in \p rv.
 *
 * A mask is only scanned when its hole count argument is non-zero. The caller
 * must ensure that no more than three holes exist in total, since that is all
 * the local array can hold.
 */
static really_inline
bool getDoubleMatchesForMask(char c1, char c2, char m1, char m2,
                             u8 c1_holes, u8 c2_holes, u8 *rv) {
    u8 holes[3] = { 0 };
    int found = 0;

    const char masks[2] = { m1, m2 };
    const bool scan[2] = { c1_holes != 0, c2_holes != 0 };

    for (int which = 0; which < 2; ++which) {
        if (!scan[which]) {
            continue;
        }
        for (int bit = 0; bit < 8; ++bit) {
            const bool is_hole = (masks[which] & (1UL << bit)) == 0;
            if (is_hole) {
                holes[found] = bit;
                ++found;
            }
        }
    }

    return getDoubleMatchesForBits(c1, c2, holes, c1_holes, c2_holes, rv);
}
bool vermicelliDoubleMasked16Build(char c1, char c2, char m1, char m2, u8 *rv) {
u8 c1_holes = 8 - __builtin_popcount(m1);
u8 c2_holes = 8 - __builtin_popcount(m2);
if (c1_holes + c2_holes > 3) {
return false;
}
return getDoubleMatchesForMask(c1, c2, m1, m2, c1_holes, c2_holes, rv);
}
} // namespace ue2

View File

@@ -43,6 +43,11 @@ namespace ue2 {
bool vermicelli16Build(const CharReach &chars, u8 *rv);
/**
 * Build the tables for the 16-byte double vermicelli scan from a set of
 * (first, second) byte pairs. Writes 16 bytes of pairs to chars and the 8
 * first bytes to firsts. Returns false if the set holds more than eight
 * pairs.
 */
bool vermicelliDouble16Build(const flat_set<std::pair<u8, u8>> &twochar,
u8 *chars, u8 *firsts);
/**
 * Build the 16-byte pair table for the masked 16-byte double vermicelli scan
 * by expanding every clear bit of m1 and m2 into both of its values. Returns
 * false if the two masks have more than three clear bits in total.
 */
bool vermicelliDoubleMasked16Build(char c1, char c2, char m1, char m2, u8 *rv);
} // namespace ue2
#endif // VERM_COMPILE_H