Reinforced Teddy with 1-byte approach, based on "shift-or" and AVX2.

This commit is contained in:
Chang, Harry 2017-01-22 12:23:25 -08:00 committed by Matthew Barr
parent b09e3acd04
commit dbd3f66e87
10 changed files with 1070 additions and 1233 deletions

View File

@ -78,12 +78,8 @@ struct LitInfo {
struct FDRConfirm {
CONF_TYPE andmsk;
CONF_TYPE mult;
u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID
u32 flags; // sole meaning is 'non-zero means no-confirm' (that is all)
u32 nBits;
hwlm_group_t groups;
u32 soleLitSize;
u32 soleLitCmp;
u32 soleLitMsk;
};
static really_inline

View File

@ -130,7 +130,7 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
static
bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
bool make_small, bool make_confirm) {
bool make_small) {
// Every literal must fit within CONF_TYPE.
assert(all_of_in(lits, [](const hwlmLiteral &lit) {
return lit.s.size() <= sizeof(CONF_TYPE);
@ -153,42 +153,6 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
}
CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
u32 flags = 0;
// we use next three variables for 'confirmless' case to speed-up
// confirmation process
u32 soleLitSize = 0;
u32 soleLitCmp = 0;
u32 soleLitMsk = 0;
if (!make_confirm) {
flags = FDRC_FLAG_NO_CONFIRM;
if (lits[0].noruns) {
// messy - need to clean this up later as flags is sorta kinda
// obsoleted
flags |= FDRC_FLAG_NOREPEAT;
}
mult = 0;
soleLitSize = lits[0].s.size() - 1;
// we can get to this point only in confirmless case;
// it means that we have only one literal per FDRConfirm (no packing),
// with no literal mask and size of literal is less or equal
// to the number of masks of Teddy engine;
// maximum number of masks for Teddy is 4, so the size of
// literal is definitely less or equal to size of u32
assert(lits[0].s.size() <= sizeof(u32));
for (u32 i = 0; i < lits[0].s.size(); i++) {
u32 shiftLoc = (sizeof(u32) - i - 1) * 8;
u8 c = lits[0].s[lits[0].s.size() - i - 1];
if (lits[0].nocase && ourisalpha(c)) {
soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc;
soleLitMsk |= (u32)CASE_CLEAR << shiftLoc;
}
else {
soleLitCmp |= (u32)c << shiftLoc;
soleLitMsk |= (u32)0xff << shiftLoc;
}
}
}
// we can walk the vector and assign elements from the vectors to a
// map by hash value
@ -276,11 +240,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
fdrc->andmsk = andmsk;
fdrc->mult = mult;
fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits;
fdrc->flags = flags;
fdrc->soleLitSize = soleLitSize;
fdrc->soleLitCmp = soleLitCmp;
fdrc->soleLitMsk = soleLitMsk;
fdrc->nBits = nBits;
fdrc->groups = gm;
@ -334,12 +294,8 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
const EngineDescription &eng,
map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
bool make_small) {
bool makeConfirm = true;
unique_ptr<TeddyEngineDescription> teddyDescr =
getTeddyDescription(eng.getID());
if (teddyDescr) {
makeConfirm = teddyDescr->needConfirm(lits);
}
BC2CONF bc2Conf;
u32 totalConfirmSize = 0;
@ -351,7 +307,7 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
}
DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
auto fc = getFDRConfirm(vl, make_small, makeConfirm);
auto fc = getFDRConfirm(vl, make_small);
totalConfirmSize += fc.size();
bc2Conf.emplace(b, move(fc));
}

View File

@ -43,11 +43,12 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
size_t i, hwlmcb_rv_t *control, u32 *last_match,
u64a conf_key) {
assert(i < a->len);
assert(i >= a->start_offset);
assert(ISALIGNED(fdrc));
const u8 * buf = a->buf;
u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
fdrc->nBitsOrSoleID);
fdrc->nBits);
u32 start = getConfirmLitIndex(fdrc)[c];
if (likely(!start)) {
return;
@ -94,80 +95,4 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
} while (oldNext);
}
// Lightweight confirmation used by 1-mask Teddy.
// If the FDRConfirm carries a hash multiplier we defer to the full
// confWithBit() procedure; in the confirmless case (mult == 0) the
// sole literal's id is reported directly via the callback, honouring
// the no-repeat flag.
static really_inline
void confWithBit1(const struct FDRConfirm *fdrc,
                  const struct FDR_Runtime_Args *a, size_t i,
                  hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
    assert(i < a->len);
    assert(ISALIGNED(fdrc));

    if (unlikely(fdrc->mult)) {
        // Full confirmation path.
        confWithBit(fdrc, a, i, control, last_match, conf_key);
        return;
    }

    // Confirmless: nBitsOrSoleID holds the sole literal's id here.
    const u32 id = fdrc->nBitsOrSoleID;
    if ((fdrc->flags & FDRC_FLAG_NOREPEAT) && *last_match == id) {
        return;
    }
    *last_match = id;
    *control = a->cb(i, i, id, a->ctxt);
}
// 'Light-weight' confirmation used by 2-/3-/4-mask Teddy engines.
// In the confirmless case (fdrc->mult == 0) it validates the sole
// literal with a fast masked 32-bit compare; otherwise it defers to
// confWithBit() for the full confirmation procedure.
static really_inline
void confWithBitMany(const struct FDRConfirm *fdrc,
const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
assert(i < a->len);
assert(ISALIGNED(fdrc));
// Matches before the scan's start offset are not reportable.
if (i < a->start_offset) {
return;
}
if (unlikely(fdrc->mult)) {
// Full confirmation path (hash into the confirm table).
confWithBit(fdrc, a, i, control, last_match, conf_key);
return;
} else {
// Confirmless: exactly one literal in this FDRConfirm.
// nBitsOrSoleID holds its id; soleLitSize is the literal
// length minus one (see getFDRConfirm in the compile side).
const u32 id = fdrc->nBitsOrSoleID;
const u32 len = fdrc->soleLitSize;
if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) {
return;
}
// When vectoring and the literal may begin before start_offset,
// re-verify its bytes explicitly; they can live partly in the
// history buffer.
if (r == VECTORING && len > i - a->start_offset) {
if (len > i + a->len_history) {
// Not enough history to hold the literal: cannot match.
return;
}
// Build a window of up to 4 trailing literal bytes, newest
// byte in the top octet.
u32 cmp = (u32)a->buf[i] << 24;
if (len <= i) {
// All required bytes lie inside the current buffer.
for (u32 j = 1; j <= len; j++) {
cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
}
} else {
// Literal straddles the buffer start: take what we can from
// buf, then splice the remainder from histBytes.
// NOTE(review): the shift amount '40 + i * 8' assumes a
// particular alignment of histBytes -- confirm against the
// FDR_Runtime_Args definition.
for (u32 j = 1; j <= i; j++) {
cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
}
cmp |= (u32)(a->histBytes >> (40 + i * 8));
}
// Masked compare against the precomputed sole-literal pattern;
// soleLitMsk clears case bits for nocase literals.
if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) {
return;
}
}
*last_match = id;
// Report the match: start is i - len (soleLitSize == size - 1).
*control = a->cb(i - len, i, id, a->ctxt);
}
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -40,10 +40,79 @@
#if defined(HAVE_AVX2)
/*
 * Partial-vector load masks for AVX2 fat Teddy. Row k has 0xff in the
 * first 32 bytes and in bytes 32+k..63, with the first k bytes of the
 * upper half zeroed. The runtime loads an unaligned 32-byte window
 * from inside this table to build a mask that suppresses bytes falling
 * outside the valid buffer region during cautious loads.
 * NOTE(review): the exact indexing/offset scheme is applied by the
 * vectored-load helper, which is not fully visible in this chunk --
 * confirm there before relying on this description.
 */
const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(isnonzero256(var))) { \
if (unlikely(diff256(var, ones256()))) { \
m256 swap = swap128in256(var); \
m256 r = interleave256lo(var, swap); \
u64a part1 = extractlow64from256(r); \
@ -51,32 +120,36 @@ do { \
r = interleave256hi(var, swap); \
u64a part3 = extractlow64from256(r); \
u64a part4 = extract64from256(r, 1); \
if (unlikely(part1)) { \
if (unlikely(part1 != ones_u64a)) { \
part1 = ~part1; \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part2)) { \
if (unlikely(part2 != ones_u64a)) { \
part2 = ~part2; \
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part3)) { \
if (unlikely(part3 != ones_u64a)) { \
part3 = ~part3; \
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part4)) { \
if (unlikely(part4 != ones_u64a)) { \
part4 = ~part4; \
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} \
} while (0);
} while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(isnonzero256(var))) { \
if (unlikely(diff256(var, ones256()))) { \
m256 swap = swap128in256(var); \
m256 r = interleave256lo(var, swap); \
u32 part1 = extractlow32from256(r); \
@ -88,56 +161,65 @@ do { \
u32 part6 = extract32from256(r, 1); \
u32 part7 = extract32from256(r, 2); \
u32 part8 = extract32from256(r, 3); \
if (unlikely(part1)) { \
if (unlikely(part1 != ones_u32)) { \
part1 = ~part1; \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part2)) { \
if (unlikely(part2 != ones_u32)) { \
part2 = ~part2; \
conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \
&control, &last_match); \
} \
if (unlikely(part3)) { \
if (unlikely(part3 != ones_u32)) { \
part3 = ~part3; \
conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part4)) { \
if (unlikely(part4 != ones_u32)) { \
part4 = ~part4; \
conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part5)) { \
if (unlikely(part5 != ones_u32)) { \
part5 = ~part5; \
conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part6)) { \
if (unlikely(part6 != ones_u32)) { \
part6 = ~part6; \
conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part7)) { \
if (unlikely(part7 != ones_u32)) { \
part7 = ~part7; \
conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
if (unlikely(part8)) { \
if (unlikely(part8 != ones_u32)) { \
part8 = ~part8; \
conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} \
} while (0);
} while(0)
#endif
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history,
len_history, nMasks));
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128);
return ret;
}
@ -147,7 +229,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
return and256(pshufb_m256(maskBase[0*2], lo),
return or256(pshufb_m256(maskBase[0 * 2], lo),
pshufb_m256(maskBase[0 * 2 + 1], hi));
}
@ -158,11 +240,11 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo),
m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
pshufb_m256(maskBase[1 * 2 + 1], hi));
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
*old_1 = res_1;
return and256(r, res_shifted_1);
return or256(r, res_shifted_1);
}
static really_inline
@ -173,11 +255,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo),
m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
pshufb_m256(maskBase[2 * 2 + 1], hi));
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
*old_2 = res_2;
return and256(r, res_shifted_2);
return or256(r, res_shifted_2);
}
static really_inline
@ -188,11 +270,11 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo),
m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
pshufb_m256(maskBase[3 * 2 + 1], hi));
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
*old_3 = res_3;
return and256(r, res_shifted_3);
return or256(r, res_shifted_3);
}
static really_inline
@ -200,486 +282,151 @@ const m256 *getMaskBase_avx2(const struct Teddy *teddy) {
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
/*
 * Declare the 'res_old' carry state for an n-mask fat Teddy engine:
 * an n-mask engine keeps n-1 result vectors from the previous
 * iteration for the shifted combine in prep_conf_fat_teddy_m<n>.
 * The 1-mask variant needs none, hence the empty do/while placeholder.
 */
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
do { \
} while(0)
/* 2-mask engine: one history vector. */
#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \
m256 res_old_1 = zeroes256();
/* 3-mask engine: two history vectors. */
#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \
m256 res_old_1 = zeroes256(); \
m256 res_old_2 = zeroes256();
/* 4-mask engine: three history vectors. */
#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \
m256 res_old_1 = zeroes256(); \
m256 res_old_2 = zeroes256(); \
m256 res_old_3 = zeroes256();
/* Token-paste dispatcher: expands to the declarations for n masks. */
#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n
/*
 * Select the prep_conf_fat_teddy_m<n> call for an n-mask engine,
 * wiring in the res_old_* history vectors declared by
 * FDR_EXEC_FAT_TEDDY_RES_OLD(n).
 */
#define PREP_CONF_FAT_FN_1(mask_base, val) \
prep_conf_fat_teddy_m1(mask_base, val)
#define PREP_CONF_FAT_FN_2(mask_base, val) \
prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
#define PREP_CONF_FAT_FN_3(mask_base, val) \
prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
#define PREP_CONF_FAT_FN_4(mask_base, val) \
prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
/* Token-paste dispatcher on the mask count n. */
#define PREP_CONF_FAT_FN(mask_base, val, n) \
PREP_CONF_FAT_FN_##n(mask_base, val)
/*
 * Shared scan loop for all fat Teddy AVX2 variants.
 *   fdr, a, control : engine descriptor, runtime args, group control
 *   n_msk           : number of masks (1-4); selects the prep function
 *                     and the amount of shift-or history state
 *   conf_fn         : confirm routine invoked on candidate positions
 *
 * Shape: a cautious (VECTORING) head load aligned down to 16 bytes, an
 * optional 16-byte block, the main 32-bytes-per-iteration loop with
 * prefetch and flood control, an optional 16-byte tail block, then a
 * final cautious partial load. In the shift-or scheme a ZERO bit marks
 * a candidate, so the partial-load p_mask is OR'd into the result to
 * suppress out-of-range bytes before confirmation.
 */
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
do { \
const u8 *buf_end = a->buf + a->len; \
const u8 *ptr = a->buf + a->start_offset; \
u32 floodBackoff = FLOOD_BACKOFF_START; \
const u8 *tryFloodDetect = a->firstFloodDetect; \
u32 last_match = (u32)-1; \
const struct Teddy *teddy = (const struct Teddy *)fdr; \
const size_t iterBytes = 32; \
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
a->buf, a->len, a->start_offset); \
\
const m256 *maskBase = getMaskBase_avx2(teddy); \
const u32 *confBase = getConfBase(teddy); \
\
FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \
const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
if (ptr < mainStart) { \
ptr = mainStart - 16; \
m256 p_mask; \
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \
a->buf, buf_end, \
a->buf_history, a->len_history, \
n_msk); \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
r_0 = or256(r_0, p_mask); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
ptr += 16; \
} \
\
if (ptr + 16 <= buf_end) { \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
ptr += 16; \
} \
\
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
__builtin_prefetch(ptr + (iterBytes * 4)); \
CHECK_FLOOD; \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \
} \
\
if (ptr + 16 <= buf_end) { \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
ptr += 16; \
} \
\
assert(ptr + 16 > buf_end); \
if (ptr < buf_end) { \
m256 p_mask; \
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \
a->buf_history, a->len_history, \
n_msk); \
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
r_0 = or256(r_0, p_mask); \
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
} \
\
return HWLM_SUCCESS; \
} while(0)
hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m256 *maskBase = getMaskBase_avx2(teddy);
const u32 *confBase = getConfBase(teddy);
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16;
}
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
}
return HWLM_SUCCESS;
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
const u8 *buf_end = a->buf + a->len;
const u8 *ptr = a->buf + a->start_offset;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 *tryFloodDetect = a->firstFloodDetect;
u32 last_match = (u32)-1;
const struct Teddy *teddy = (const struct Teddy *)fdr;
const size_t iterBytes = 32;
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
a->buf, a->len, a->start_offset);
const m256 *maskBase = getMaskBase_avx2(teddy);
const u32 *confBase = getConfBase(teddy);
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
if (ptr < mainStart) {
ptr = mainStart - 16;
m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16;
}
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
}
for (; ptr < buf_end; ptr += 16) {
m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
}
return HWLM_SUCCESS;
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
/*
 * AVX2 "fat" Teddy runtime: 2 masks, unpacked buckets, so confirmation
 * goes through do_confWithBitMany_teddy.
 *
 * Scans a->buf[a->start_offset .. a->len) in 16-byte blocks: a cautious
 * (poison-masked) head load, one optional aligned block, a 32-byte
 * unrolled main loop with flood control, then a cautious tail loop.
 *
 * Fix: removed the unreachable FDR_EXEC_FAT_TEDDY() statement that
 * followed the unconditional return at the end of the function.
 */
hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a,
                                           hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = (u32)-1;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m256 *maskBase = getMaskBase_avx2(teddy);
    const u32 *confBase = getConfBase(teddy);

    /* shift-or state carried across blocks via prep_conf_fat_teddy_m2 */
    m256 res_old_1 = ones256();
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);

    if (ptr < mainStart) {
        /* Unaligned head: load cautiously; p_mask poisons invalid lanes. */
        ptr = mainStart - 16;
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 2);
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        /* One aligned block to reach the 32-byte main-loop stride. */
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
        ptr += 16;
    }

    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
        m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
                                          load2x128(ptr + 16));
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        /* Cautious tail: fewer than 16 valid bytes may remain. */
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 2);
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
    }

    return HWLM_SUCCESS;
}
/*
 * AVX2 "fat" Teddy runtime: 2 masks, packed buckets, so confirmation
 * goes through do_confWithBit_teddy.
 *
 * Structure matches the other fat Teddy exec variants: cautious head,
 * optional aligned block, unrolled 32-byte main loop with flood control,
 * cautious tail.
 *
 * Fix: removed the unreachable FDR_EXEC_FAT_TEDDY() statement that
 * followed the unconditional return at the end of the function.
 */
hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a,
                                               hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = (u32)-1;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m256 *maskBase = getMaskBase_avx2(teddy);
    const u32 *confBase = getConfBase(teddy);

    /* shift-or state carried across blocks via prep_conf_fat_teddy_m2 */
    m256 res_old_1 = ones256();
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);

    if (ptr < mainStart) {
        /* Unaligned head: load cautiously; p_mask poisons invalid lanes. */
        ptr = mainStart - 16;
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 2);
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        /* One aligned block to reach the 32-byte main-loop stride. */
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
        m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
                                          load2x128(ptr + 16));
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        /* Cautious tail: fewer than 16 valid bytes may remain. */
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 2);
        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
    }

    return HWLM_SUCCESS;
}
/*
 * AVX2 "fat" Teddy runtime: 3 masks, unpacked buckets, so confirmation
 * goes through do_confWithBitMany_teddy.
 *
 * Two shift-or state registers (res_old_1, res_old_2) are threaded
 * through prep_conf_fat_teddy_m3 across blocks.
 *
 * Fix: removed the unreachable FDR_EXEC_FAT_TEDDY() statement that
 * followed the unconditional return at the end of the function.
 */
hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a,
                                           hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = (u32)-1;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m256 *maskBase = getMaskBase_avx2(teddy);
    const u32 *confBase = getConfBase(teddy);

    m256 res_old_1 = ones256();
    m256 res_old_2 = ones256();
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);

    if (ptr < mainStart) {
        /* Unaligned head: load cautiously; p_mask poisons invalid lanes. */
        ptr = mainStart - 16;
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 3);
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        /* One aligned block to reach the 32-byte main-loop stride. */
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
        ptr += 16;
    }

    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
        m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          load2x128(ptr + 16));
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        /* Cautious tail: fewer than 16 valid bytes may remain. */
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 3);
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
    }

    return HWLM_SUCCESS;
}
/*
 * AVX2 "fat" Teddy runtime: 3 masks, packed buckets, so confirmation
 * goes through do_confWithBit_teddy.
 *
 * Fix: removed the unreachable FDR_EXEC_FAT_TEDDY() statement that
 * followed the unconditional return at the end of the function.
 */
hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a,
                                               hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = (u32)-1;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m256 *maskBase = getMaskBase_avx2(teddy);
    const u32 *confBase = getConfBase(teddy);

    m256 res_old_1 = ones256();
    m256 res_old_2 = ones256();
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);

    if (ptr < mainStart) {
        /* Unaligned head: load cautiously; p_mask poisons invalid lanes. */
        ptr = mainStart - 16;
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 3);
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        /* One aligned block to reach the 32-byte main-loop stride. */
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
        m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          load2x128(ptr + 16));
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        /* Cautious tail: fewer than 16 valid bytes may remain. */
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 3);
        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
                                          val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
    }

    return HWLM_SUCCESS;
}
/*
 * AVX2 "fat" Teddy runtime: 4 masks, unpacked buckets, so confirmation
 * goes through do_confWithBitMany_teddy.
 *
 * Three shift-or state registers are threaded through
 * prep_conf_fat_teddy_m4 across blocks.
 *
 * Fix: removed the unreachable FDR_EXEC_FAT_TEDDY() statement that
 * followed the unconditional return at the end of the function.
 */
hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a,
                                           hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = (u32)-1;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m256 *maskBase = getMaskBase_avx2(teddy);
    const u32 *confBase = getConfBase(teddy);

    m256 res_old_1 = ones256();
    m256 res_old_2 = ones256();
    m256 res_old_3 = ones256();
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);

    if (ptr < mainStart) {
        /* Unaligned head: load cautiously; p_mask poisons invalid lanes. */
        ptr = mainStart - 16;
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 4);
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        /* One aligned block to reach the 32-byte main-loop stride. */
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
        ptr += 16;
    }

    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
        m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, load2x128(ptr + 16));
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        /* Cautious tail: fewer than 16 valid bytes may remain. */
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 4);
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
    }

    return HWLM_SUCCESS;
}
/*
 * AVX2 "fat" Teddy runtime: 4 masks, packed buckets, so confirmation
 * goes through do_confWithBit_teddy.
 *
 * Fix: removed the unreachable FDR_EXEC_FAT_TEDDY() statement that
 * followed the unconditional return at the end of the function.
 */
hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a,
                                               hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = (u32)-1;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m256 *maskBase = getMaskBase_avx2(teddy);
    const u32 *confBase = getConfBase(teddy);

    m256 res_old_1 = ones256();
    m256 res_old_2 = ones256();
    m256 res_old_3 = ones256();
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);

    if (ptr < mainStart) {
        /* Unaligned head: load cautiously; p_mask poisons invalid lanes. */
        ptr = mainStart - 16;
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 4);
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    if (ptr + 16 < buf_end) {
        /* One aligned block to reach the 32-byte main-loop stride. */
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
        ptr += 16;
    }

    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes*4));
        CHECK_FLOOD;
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, load2x128(ptr));
        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
        m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, load2x128(ptr + 16));
        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
    }

    for (; ptr < buf_end; ptr += 16) {
        /* Cautious tail: fewer than 16 valid bytes may remain. */
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
                                       a->buf_history, a->len_history, 4);
        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
                                          &res_old_3, val_0);
        r_0 = and256(r_0, p_mask);
        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
    }

    return HWLM_SUCCESS;
}
#endif // HAVE_AVX2

View File

@ -309,6 +309,169 @@ bool TeddyCompiler::pack(map<BucketIndex,
return true;
}
// this entry has all-zero mask to skip reinforcement
#define NO_REINFORCEMENT N_CHARS
// this means every entry in reinforcement table
#define ALL_CHAR_SET N_CHARS
// each item's reinforcement mask has REINFORCED_MSK_LEN bytes
#define REINFORCED_MSK_LEN 8
// Initialise every character's reinforcement entry to the default pattern:
// low byte clear, remaining seven bytes all-ones.
static
void initReinforcedTable(u8 *reinforcedMsk) {
    u64a *mask = (u64a *)reinforcedMsk;
    for (size_t i = 0; i < N_CHARS; i++) {
        mask[i] = 0x00ffffffffffffffULL;
    }
}
// Zero out the NO_REINFORCEMENT entry: an all-zero mask never blocks
// anything, so it acts as the "skip reinforcement" row.
static
void fillReinforcedMskZero(u8 *reinforcedMsk) {
    u8 *entry = reinforcedMsk + NO_REINFORCEMENT * REINFORCED_MSK_LEN;
    for (u32 i = 0; i < REINFORCED_MSK_LEN; i++) {
        entry[i] = 0x00;
    }
}
// Clear bit bmsk at byte position j-1 in the reinforcement entry for
// character c; with c == ALL_CHAR_SET, clear it in every character's entry.
static
void fillReinforcedMsk(u8 *reinforcedMsk, u16 c, u32 j, u8 bmsk) {
    assert(j > 0);
    const size_t first = (c == ALL_CHAR_SET) ? 0 : c;
    const size_t last = (c == ALL_CHAR_SET) ? N_CHARS : (size_t)c + 1;
    for (size_t i = first; i < last; i++) {
        u8 *entry = reinforcedMsk + i * REINFORCED_MSK_LEN;
        entry[j - 1] &= ~bmsk;
    }
}
#ifdef TEDDY_DEBUG
static
void dumpReinforcedMaskTable(const u8 *msks) {
for (u32 i = 0; i <= N_CHARS; i++) {
printf("0x%02x: ", i);
for (u32 j = 0; j < REINFORCED_MSK_LEN; j++) {
u8 val = msks[i * REINFORCED_MSK_LEN + j];
for (u32 k = 0; k < 8; k++) {
printf("%s", ((val >> k) & 0x1) ? "1" : "0");
}
printf(" ");
}
printf("\n");
}
}
#endif
/*
 * Populate the Teddy nibble (shufti) mask tables.
 *
 * The table starts all-ones; a CLEARED bit "1 << (bucket_id % 8)" in an
 * entry means "this nibble value is compatible with this bucket" (the
 * shift-or convention: 0 = possible match, 1 = blocked).  For mask j, the
 * low-nibble rows sit at index j*2*maskWidth and the high-nibble rows at
 * (j*2+1)*maskWidth; bucket_id/8 selects the row within the group.
 */
static
void fillNibbleMasks(const map<BucketIndex,
                               vector<LiteralIndex>> &bucketToLits,
                     const vector<hwlmLiteral> &lits,
                     u32 numMasks, u32 maskWidth, size_t maskLen,
                     u8 *baseMsk) {
    memset(baseMsk, 0xff, maskLen);

    for (const auto &b2l : bucketToLits) {
        const u32 &bucket_id = b2l.first;
        const vector<LiteralIndex> &ids = b2l.second;
        // bit within each mask byte that belongs to this bucket
        const u8 bmsk = 1U << (bucket_id % 8);

        for (const LiteralIndex &lit_id : ids) {
            const hwlmLiteral &l = lits[lit_id];
            DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
            const u32 sz = verify_u32(l.s.size());

            // fill in masks
            for (u32 j = 0; j < numMasks; j++) {
                const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
                const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
                const u32 lo_base = msk_id_lo * 16;
                const u32 hi_base = msk_id_hi * 16;

                // if we don't have a char at this position, clear the
                // bucket's bit ('0' = allow) for every nibble value so any
                // character matches here
                if (j >= sz) {
                    for (u32 n = 0; n < 16; n++) {
                        baseMsk[lo_base + n] &= ~bmsk;
                        baseMsk[hi_base + n] &= ~bmsk;
                    }
                } else {
                    u8 c = l.s[sz - 1 - j];
                    // if we do have a char at this position
                    const u32 hiShift = 4;
                    u32 n_hi = (c >> hiShift) & 0xf;
                    u32 n_lo = c & 0xf;

                    if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
                        // masked literal byte: allow every nibble value cm
                        // that agrees with cmp under the per-byte mask
                        u8 m = l.msk[l.msk.size() - 1 - j];
                        u8 m_hi = (m >> hiShift) & 0xf;
                        u8 m_lo = m & 0xf;
                        u8 cmp = l.cmp[l.msk.size() - 1 - j];
                        u8 cmp_lo = cmp & 0xf;
                        u8 cmp_hi = (cmp >> hiShift) & 0xf;

                        for (u8 cm = 0; cm < 0x10; cm++) {
                            if ((cm & m_lo) == (cmp_lo & m_lo)) {
                                baseMsk[lo_base + cm] &= ~bmsk;
                            }
                            if ((cm & m_hi) == (cmp_hi & m_hi)) {
                                baseMsk[hi_base + cm] &= ~bmsk;
                            }
                        }
                    } else {
                        if (l.nocase && ourisalpha(c)) {
                            // caseless alpha: only the high nibble differs
                            // between cases (0x20 bit), so allow both
                            u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
                            u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
                            baseMsk[hi_base + (n_hi & cmHalfClear)] &= ~bmsk;
                            baseMsk[hi_base + (n_hi | cmHalfSet)] &= ~bmsk;
                        } else {
                            baseMsk[hi_base + n_hi] &= ~bmsk;
                        }
                        baseMsk[lo_base + n_lo] &= ~bmsk;
                    }
                }
            }
        }
    }
}
/*
 * Build the per-character reinforcement table for the 1-byte Teddy
 * approach: for each bucket's literal, clear that bucket's bit in the
 * entry of the character found j bytes (j = 1..REINFORCED_MSK_LEN-1)
 * before the literal's last byte.  Positions beyond the literal's length
 * accept any character (ALL_CHAR_SET).  Caseless alpha bytes get both
 * case variants cleared.
 *
 * NOTE(review): "sz - 1 < j" relies on sz >= 1; an empty literal would
 * underflow the unsigned subtraction and then index l.s out of bounds —
 * presumably the compiler never emits empty literals; confirm upstream.
 */
static
void fillReinforcedTable(const map<BucketIndex,
                                   vector<LiteralIndex>> &bucketToLits,
                         const vector<hwlmLiteral> &lits,
                         u8 *reinforcedMsk) {
    initReinforcedTable(reinforcedMsk);

    for (const auto &b2l : bucketToLits) {
        const u32 &bucket_id = b2l.first;
        const vector<LiteralIndex> &ids = b2l.second;
        const u8 bmsk = 1U << (bucket_id % 8);

        for (const LiteralIndex &lit_id : ids) {
            const hwlmLiteral &l = lits[lit_id];
            DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
            const u32 sz = verify_u32(l.s.size());

            // fill in reinforced masks
            for (u32 j = 1; j < REINFORCED_MSK_LEN; j++) {
                if (sz - 1 < j) {
                    // literal too short: any character is acceptable here
                    fillReinforcedMsk(reinforcedMsk, ALL_CHAR_SET, j, bmsk);
                } else {
                    u8 c = l.s[sz - 1 - j];
                    if (l.nocase && ourisalpha(c)) {
                        u8 c_up = c & 0xdf;
                        fillReinforcedMsk(reinforcedMsk, c_up, j, bmsk);
                        u8 c_lo = c | 0x20;
                        fillReinforcedMsk(reinforcedMsk, c_lo, j, bmsk);
                    } else {
                        fillReinforcedMsk(reinforcedMsk, c, j, bmsk);
                    }
                }
            }
        }
    }

    // the NO_REINFORCEMENT entry stays all-zero (never blocks)
    fillReinforcedMskZero(reinforcedMsk);
}
bytecode_ptr<FDR> TeddyCompiler::build() {
assert(eng.numMasks <= MAX_NUM_MASKS);
@ -329,27 +492,23 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
#endif
map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
if (eng.needConfirm(lits)) {
if (!pack(bucketToLits)) {
DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
lits.size(), eng.getNumBuckets());
return nullptr;
}
} else {
for (u32 i = 0; i < lits.size(); i++) {
bucketToLits[i].push_back(i);
}
}
u32 maskWidth = eng.getNumBuckets() / 8;
size_t headerSize = sizeof(Teddy);
size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
size_t reinforcedMaskLen = (N_CHARS + 1) * REINFORCED_MSK_LEN;
auto floodTable = setupFDRFloodControl(lits, eng, grey);
auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
// Note: we place each major structure here on a cacheline boundary.
size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
ROUNDUP_CL(reinforcedMaskLen) +
ROUNDUP_CL(confirmTable.size()) + floodTable.size();
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
@ -363,7 +522,8 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
teddy->maxStringLen = verify_u32(maxLen(lits));
// Write confirm structures.
u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen);
u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
ROUNDUP_CL(reinforcedMaskLen);
assert(ISALIGNED_CL(ptr));
teddy->confOffset = verify_u32(ptr - teddy_base);
memcpy(ptr, confirmTable.get(), confirmTable.size());
@ -377,69 +537,12 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
// Write teddy masks.
u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen,
baseMsk);
for (const auto &b2l : bucketToLits) {
const u32 &bucket_id = b2l.first;
const vector<LiteralIndex> &ids = b2l.second;
const u8 bmsk = 1U << (bucket_id % 8);
for (const LiteralIndex &lit_id : ids) {
const hwlmLiteral &l = lits[lit_id];
DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
const u32 sz = verify_u32(l.s.size());
// fill in masks
for (u32 j = 0; j < eng.numMasks; j++) {
const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
const u32 lo_base = msk_id_lo * 16;
const u32 hi_base = msk_id_hi * 16;
// if we don't have a char at this position, fill in i
// locations in these masks with '1'
if (j >= sz) {
for (u32 n = 0; n < 16; n++) {
baseMsk[lo_base + n] |= bmsk;
baseMsk[hi_base + n] |= bmsk;
}
} else {
u8 c = l.s[sz - 1 - j];
// if we do have a char at this position
const u32 hiShift = 4;
u32 n_hi = (c >> hiShift) & 0xf;
u32 n_lo = c & 0xf;
if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
u8 m = l.msk[l.msk.size() - 1 - j];
u8 m_hi = (m >> hiShift) & 0xf;
u8 m_lo = m & 0xf;
u8 cmp = l.cmp[l.msk.size() - 1 - j];
u8 cmp_lo = cmp & 0xf;
u8 cmp_hi = (cmp >> hiShift) & 0xf;
for (u8 cm = 0; cm < 0x10; cm++) {
if ((cm & m_lo) == (cmp_lo & m_lo)) {
baseMsk[lo_base + cm] |= bmsk;
}
if ((cm & m_hi) == (cmp_hi & m_hi)) {
baseMsk[hi_base + cm] |= bmsk;
}
}
} else {
if (l.nocase && ourisalpha(c)) {
u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
baseMsk[hi_base + (n_hi & cmHalfClear)] |= bmsk;
baseMsk[hi_base + (n_hi | cmHalfSet)] |= bmsk;
} else {
baseMsk[hi_base + n_hi] |= bmsk;
}
baseMsk[lo_base + n_lo] |= bmsk;
}
}
}
}
}
// Write reinforcement masks.
u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
fillReinforcedTable(bucketToLits, lits, reinforcedMsk);
#ifdef TEDDY_DEBUG
for (u32 i = 0; i < eng.numMasks * 2; i++) {
@ -452,6 +555,10 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
}
printf("\n");
}
printf("\n===============================================\n"
"reinforced mask table for low boundary (original)\n\n");
dumpReinforcedMaskTable(reinforcedMsk);
#endif
return fdr;

View File

@ -51,18 +51,6 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
return numMasks;
}
bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const {
if (packed || lits.size() > getNumBuckets()) {
return true;
}
for (const auto &lit : lits) {
if (lit.s.size() > numMasks || !lit.msk.empty()) {
return true;
}
}
return false;
}
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
static const TeddyEngineDef defns[] = {
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },

View File

@ -55,7 +55,6 @@ public:
explicit TeddyEngineDescription(const TeddyEngineDef &def);
u32 getDefaultFloodSuffixLength() const override;
bool needConfirm(const std::vector<hwlmLiteral> &lits) const;
};
std::unique_ptr<TeddyEngineDescription>

View File

@ -26,6 +26,25 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/* Teddy bytecode layout:
 * |-----|
 * |     | struct Teddy
 * |-----|
 * |     | teddy masks
 * |     |
 * |-----|
 * |     | reinforcement mask table
 * |     |
 * |-----|
 * |     | confirm
 * |     |
 * |     |
 * |-----|
 * |     | flood control
 * |     |
 * |-----|
 */
#ifndef TEDDY_INTERNAL_H
#define TEDDY_INTERNAL_H

View File

@ -38,8 +38,12 @@
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#if defined(__AVX2__)
extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
#endif
#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
}
// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
// m byte 0xff in the beginning, followed by n byte 0x00,
// then followed by the rest bytes 0xff.
// ptr >= lo:
// no history.
// for end/short zone, ptr==lo and start_offset==0
// for start zone, see below
// lo ptr hi hi
// |----------|-------|----------------|............|
// start 0 start+offset end(<=16)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
// only start zone.
// history
// ptr lo hi hi
// |----------|-------|----------------|............|
// 0 start start+offset end(<=16)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
union {
@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
uintptr_t copy_start;
uintptr_t copy_len;
if (ptr >= lo) {
if (ptr >= lo) { // short/end/start zone
uintptr_t start = (uintptr_t)(ptr - lo);
uintptr_t avail = (uintptr_t)(hi - ptr);
if (avail >= 16) {
*p_mask = load128(p_mask_arr[16] + 16);
assert(start_offset - start <= 16);
*p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+ 16 - start_offset + start);
return loadu128(ptr);
}
*p_mask = load128(p_mask_arr[avail] + 16);
assert(start_offset - start <= avail);
*p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+ 16 - start_offset + start);
copy_start = 0;
copy_len = avail;
} else {
} else { // start zone
uintptr_t need = MIN((uintptr_t)(lo - ptr),
MIN(len_history, nMasks - 1));
uintptr_t start = (uintptr_t)(lo - ptr);
uintptr_t i;
for (i = start - need; ptr + i < lo; i++) {
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
for (i = start - need; i < start; i++) {
u.val8[i] = buf_history[len_history - (start - i)];
}
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
*p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
copy_start = i;
copy_len = end - i;
assert(start + start_offset <= end);
*p_mask = loadu128(p_mask_arr[end - start - start_offset]
+ 16 - start - start_offset);
copy_start = start;
copy_len = end - start;
}
// Runt block from the buffer.
@ -152,6 +182,135 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
return u.val128;
}
#if defined(__AVX2__)
/*
* \brief Copy a block of [0,31] bytes efficiently.
*
* This function is a workaround intended to stop some compilers from
* synthesizing a memcpy function call out of the copy of a small number of
* bytes that we do in vectoredLoad256.
*/
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
    /* Hand-rolled copy of up to 31 bytes, sized branch by branch so the
     * compiler cannot turn it into a memcpy call.  Mid-size lengths are
     * handled with two overlapping loads/stores. */
    if (len == 0) {
        return;
    }
    if (len == 1) {
        dst[0] = src[0];
    } else if (len == 2) {
        unaligned_store_u16(dst, unaligned_load_u16(src));
    } else if (len == 3) {
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
    } else if (len == 4) {
        unaligned_store_u32(dst, unaligned_load_u32(src));
    } else if (len < 8) {
        /* 5..7 bytes: two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
    } else if (len == 8) {
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
    } else if (len < 16) {
        /* 9..15 bytes: two overlapping 8-byte chunks. */
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
    } else if (len == 16) {
        storeu128(dst, loadu128(src));
    } else {
        /* 17..31 bytes: two overlapping 16-byte chunks. */
        assert(len < 32);
        storeu128(dst + len - 16, loadu128(src + len - 16));
        storeu128(dst, loadu128(src));
    }
}
// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
// m byte 0xff in the beginning, followed by n byte 0x00,
// then followed by the rest bytes 0xff.
// ptr >= lo:
// no history.
// for end/short zone, ptr==lo and start_offset==0
// for start zone, see below
// lo ptr hi hi
// |----------|-------|----------------|............|
// start 0 start+offset end(<=32)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
// only start zone.
// history
// ptr lo hi hi
// |----------|-------|----------------|............|
// 0 start start+offset end(<=32)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    /* Assemble a 32-byte block around ptr even when fewer than 32 valid
     * bytes are in range; *p_mask marks the invalid lanes (see the zone
     * diagram in the comment above). */
    union {
        u8 val8[32];
        m256 val256;
    } u;
    u.val256 = zeroes256();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone: no history bytes needed
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 32) {
            // full block available: direct unaligned load; mask covers
            // only the start_offset prefix
            assert(start_offset - start <= 32);
            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
                               + 32 - start_offset + start);
            return loadu256(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
                           + 32 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { //start zone: ptr precedes the buffer; seed the leading lanes
             // with up to nMasks - 1 bytes of stream history
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
                           + 32 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
    return u.val256;
}
#endif // __AVX2__
static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
CautionReason reason) {
@ -196,53 +355,17 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
} while (unlikely(*conf));
}
/* Drain every set bit of *conf (each bit encodes a byte position and a
 * bucket index) and run single-literal confirmation for buckets whose
 * groups are currently enabled. */
static really_inline
void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                           const u32 *confBase, CautionReason reason,
                           const struct FDR_Runtime_Args *a, const u8 *ptr,
                           hwlmcb_rv_t *control, u32 *last_match) {
    do {
        u32 lsb = TEDDY_FIND_AND_CLEAR_LSB(conf);
        u32 bucket_idx = lsb % bucket;
        u32 pos = lsb / bucket + offset;
        const struct FDRConfirm *fdrc =
            (const struct FDRConfirm *)((const u8 *)confBase +
                                        confBase[bucket_idx]);
        if (fdrc->groups & *control) {
            u64a confVal = getConfVal(a, ptr, pos, reason);
            confWithBit1(fdrc, a, ptr - a->buf + pos, control, last_match,
                         confVal);
        }
    } while (unlikely(*conf));
}
/* Drain every set bit of *conf (each bit encodes a byte position and a
 * bucket index) and run multi-literal confirmation for buckets whose
 * groups are currently enabled. */
static really_inline
void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                              const u32 *confBase, CautionReason reason,
                              const struct FDR_Runtime_Args *a, const u8 *ptr,
                              hwlmcb_rv_t *control, u32 *last_match) {
    do {
        u32 lsb = TEDDY_FIND_AND_CLEAR_LSB(conf);
        u32 bucket_idx = lsb % bucket;
        u32 pos = lsb / bucket + offset;
        const struct FDRConfirm *fdrc =
            (const struct FDRConfirm *)((const u8 *)confBase +
                                        confBase[bucket_idx]);
        if (fdrc->groups & *control) {
            u64a confVal = getConfVal(a, ptr, pos, reason);
            confWithBitMany(fdrc, a, ptr - a->buf + pos, reason, control,
                            last_match, confVal);
        }
    } while (unlikely(*conf));
}
/* The Teddy nibble masks occupy the first cacheline-aligned slot after
 * the Teddy header. */
static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) {
    const u8 *base = (const u8 *)teddy;
    return (const m128 *)(base + ROUNDUP_CL(sizeof(struct Teddy)));
}
/* The reinforcement table follows the 2 * numMask nibble masks, rounded
 * up to a cacheline boundary. */
static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
    const u8 *masks = (const u8 *)getMaskBase(teddy);
    return (const u64a *)(masks + ROUNDUP_CL(2 * numMask * sizeof(m128)));
}
static really_inline
const u32 *getConfBase(const struct Teddy *teddy) {
return (const u32 *)((const u8 *)teddy + teddy->confOffset);