Implement new DoubleVermicelli16 acceleration functions using SVE2

Change-Id: Id4a8ffca840caab930a6e78cc0dfd0fe7d320b4e
This commit is contained in:
George Wort
2021-06-28 16:29:43 +01:00
committed by Konstantinos Margaritis
parent 91f5f10831
commit e1f0f6baf7
9 changed files with 874 additions and 159 deletions

View File

@@ -50,4 +50,207 @@ bool vermicelli16Build(const CharReach &chars, u8 *rv) {
return true;
}
bool vermicelliDouble16Build(const flat_set<std::pair<u8, u8>> &twochar,
u8 *chars, u8 *firsts) {
constexpr size_t count_limit = 8;
if (twochar.size() > count_limit) return false;
size_t count = 0;
for (const auto &p : twochar) {
firsts[count] = p.first;
chars[2 * count] = p.first;
chars[(2 * count) + 1] = p.second;
++count;
}
for(; count < count_limit; ++count) {
firsts[count] = chars[0];
chars[2 * count] = chars[0];
chars[(2 * count) + 1] = chars[1];
}
return true;
}
static really_inline
void fillMask(u8 matches[], size_t len, u8 *rv) {
for (size_t i = 0; i < 16; ++i) {
rv[i] = matches[i % len];
}
}
static really_inline
void getTwoCases(u8 cases[2], u8 bit, char c) {
const u8 set = 1UL << bit;
cases[0] = c & (~set);
cases[1] = c | set;
}
static really_inline
void getFourCases(u8 cases[4], u8 bit, char case1, char case2) {
const u8 set = 1UL << bit;
cases[0] = case1 & (~set);
cases[1] = case1 | set;
cases[2] = case2 & (~set);
cases[3] = case2 | set;
}
static really_inline
void getEightCases(u8 cases[8], u8 bit, char case1, char case2,
char case3, char case4) {
const u8 set = 1UL << bit;
cases[0] = case1 & (~set);
cases[1] = case1 | set;
cases[2] = case2 & (~set);
cases[3] = case2 | set;
cases[4] = case3 & (~set);
cases[5] = case3 | set;
cases[6] = case4 & (~set);
cases[7] = case4 | set;
}
static really_inline
bool getDoubleMatchesForBits(u8 c1, u8 c2, u8 holes[3], u8 c1_holes,
u8 c2_holes, u8 *rv) {
u8 cases[8];
switch (c1_holes) {
case 0:
switch (c2_holes) {
case 0: {
u8 matches[2] = { c1, c2 };
fillMask(matches, 2, rv);
return true;
}
case 1: {
getTwoCases(cases, holes[0], c2);
u8 matches[4] = { c1, cases[0], c1, cases[1] };
fillMask(matches, 4, rv);
return true;
}
case 2: {
getTwoCases(cases, holes[0], c2);
getFourCases(&cases[2], holes[1], cases[0], cases[1]);
u8 matches[8] = { c1, cases[2], c1, cases[3],
c1, cases[4], c1, cases[5] };
fillMask(matches, 8, rv);
return true;
}
case 3: {
getTwoCases(cases, holes[0], c2);
getFourCases(&cases[4], holes[1], cases[0], cases[1]);
getEightCases(cases, holes[2], cases[4], cases[5],
cases[6], cases[7]);
u8 matches[16] = { c1, cases[0], c1, cases[1],
c1, cases[2], c1, cases[3],
c1, cases[4], c1, cases[5],
c1, cases[6], c1, cases[7] };
memcpy(rv, matches, sizeof(matches));
return true;
}
default:
assert(c2_holes < 4);
break;
}
break;
case 1:
getTwoCases(cases, holes[0], c1);
switch (c2_holes) {
case 0: {
u8 matches[4] = { cases[0] , c2, cases[1], c2 };
fillMask(matches, 4, rv);
return true;
}
case 1: {
getTwoCases(&cases[2], holes[1], c2);
u8 matches[8] = { cases[0], cases[2],
cases[0], cases[3],
cases[1], cases[2],
cases[1], cases[3] };
fillMask(matches, 8, rv);
return true;
}
case 2: {
getTwoCases(&cases[2], holes[1], c2);
getFourCases(&cases[4], holes[2], cases[2], cases[3]);
u8 matches[16] = { cases[0], cases[4], cases[0], cases[5],
cases[0], cases[6], cases[0], cases[7],
cases[1], cases[4], cases[1], cases[5],
cases[1], cases[6], cases[1], cases[7] };
memcpy(rv, matches, sizeof(matches));
return true;
}
default:
assert(c2_holes < 3);
break;
}
break;
case 2:
getTwoCases(cases, holes[0], c1);
getFourCases(&cases[2], holes[1], cases[0], cases[1]);
switch (c2_holes) {
case 0: {
u8 matches[8] = { cases[2], c2, cases[3], c2,
cases[4], c2, cases[5], c2 };
fillMask(matches, 8, rv);
return true;
}
case 1: {
getTwoCases(&cases[6], holes[2], c2);
u8 matches[16] = { cases[2], cases[6], cases[3], cases[6],
cases[4], cases[6], cases[5], cases[6],
cases[2], cases[7], cases[3], cases[7],
cases[4], cases[7], cases[5], cases[7] };
memcpy(rv, matches, sizeof(matches));
return true;
}
default:
assert(c2_holes < 2);
break;
}
break;
case 3: {
assert(!c2_holes);
getTwoCases(cases, holes[0], c1);
getFourCases(&cases[4], holes[1], cases[0], cases[1]);
getEightCases(cases, holes[2], cases[4], cases[5],
cases[6], cases[7]);
u8 matches[16] = { cases[0], c2, cases[1], c2,
cases[2], c2, cases[3], c2,
cases[4], c2, cases[5], c2,
cases[6], c2, cases[7], c2 };
memcpy(rv, matches, sizeof(matches));
return true;
}
}
return false;
}
static really_inline
bool getDoubleMatchesForMask(char c1, char c2, char m1, char m2,
u8 c1_holes, u8 c2_holes, u8 *rv) {
u8 holes[3] = { 0 };
int count = 0;
if (c1_holes) {
for (int i = 0; i < 8; ++i) {
if (!(m1 & (1UL << i))) {
holes[count++] = i;
}
}
}
if (c2_holes) {
for (int i = 0; i < 8; ++i) {
if (!(m2 & (1UL << i))) {
holes[count++] = i;
}
}
}
return getDoubleMatchesForBits(c1, c2, holes, c1_holes, c2_holes, rv);
}
bool vermicelliDoubleMasked16Build(char c1, char c2, char m1, char m2, u8 *rv) {
u8 c1_holes = 8 - __builtin_popcount(m1);
u8 c2_holes = 8 - __builtin_popcount(m2);
if (c1_holes + c2_holes > 3) {
return false;
}
return getDoubleMatchesForMask(c1, c2, m1, m2, c1_holes, c2_holes, rv);
}
} // namespace ue2