mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Reinforced Teddy with 1-byte approach, based on "shift-or" and AVX2.
This commit is contained in:
parent
b09e3acd04
commit
dbd3f66e87
@ -78,12 +78,8 @@ struct LitInfo {
|
||||
struct FDRConfirm {
|
||||
CONF_TYPE andmsk;
|
||||
CONF_TYPE mult;
|
||||
u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID
|
||||
u32 flags; // sole meaning is 'non-zero means no-confirm' (that is all)
|
||||
u32 nBits;
|
||||
hwlm_group_t groups;
|
||||
u32 soleLitSize;
|
||||
u32 soleLitCmp;
|
||||
u32 soleLitMsk;
|
||||
};
|
||||
|
||||
static really_inline
|
||||
|
@ -130,7 +130,7 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
|
||||
|
||||
static
|
||||
bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
||||
bool make_small, bool make_confirm) {
|
||||
bool make_small) {
|
||||
// Every literal must fit within CONF_TYPE.
|
||||
assert(all_of_in(lits, [](const hwlmLiteral &lit) {
|
||||
return lit.s.size() <= sizeof(CONF_TYPE);
|
||||
@ -153,42 +153,6 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
||||
}
|
||||
|
||||
CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
|
||||
u32 flags = 0;
|
||||
// we use next three variables for 'confirmless' case to speed-up
|
||||
// confirmation process
|
||||
u32 soleLitSize = 0;
|
||||
u32 soleLitCmp = 0;
|
||||
u32 soleLitMsk = 0;
|
||||
|
||||
if (!make_confirm) {
|
||||
flags = FDRC_FLAG_NO_CONFIRM;
|
||||
if (lits[0].noruns) {
|
||||
// messy - need to clean this up later as flags is sorta kinda
|
||||
// obsoleted
|
||||
flags |= FDRC_FLAG_NOREPEAT;
|
||||
}
|
||||
mult = 0;
|
||||
soleLitSize = lits[0].s.size() - 1;
|
||||
// we can get to this point only in confirmless case;
|
||||
// it means that we have only one literal per FDRConfirm (no packing),
|
||||
// with no literal mask and size of literal is less or equal
|
||||
// to the number of masks of Teddy engine;
|
||||
// maximum number of masks for Teddy is 4, so the size of
|
||||
// literal is definitely less or equal to size of u32
|
||||
assert(lits[0].s.size() <= sizeof(u32));
|
||||
for (u32 i = 0; i < lits[0].s.size(); i++) {
|
||||
u32 shiftLoc = (sizeof(u32) - i - 1) * 8;
|
||||
u8 c = lits[0].s[lits[0].s.size() - i - 1];
|
||||
if (lits[0].nocase && ourisalpha(c)) {
|
||||
soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc;
|
||||
soleLitMsk |= (u32)CASE_CLEAR << shiftLoc;
|
||||
}
|
||||
else {
|
||||
soleLitCmp |= (u32)c << shiftLoc;
|
||||
soleLitMsk |= (u32)0xff << shiftLoc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// we can walk the vector and assign elements from the vectors to a
|
||||
// map by hash value
|
||||
@ -276,11 +240,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
||||
|
||||
fdrc->andmsk = andmsk;
|
||||
fdrc->mult = mult;
|
||||
fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits;
|
||||
fdrc->flags = flags;
|
||||
fdrc->soleLitSize = soleLitSize;
|
||||
fdrc->soleLitCmp = soleLitCmp;
|
||||
fdrc->soleLitMsk = soleLitMsk;
|
||||
fdrc->nBits = nBits;
|
||||
|
||||
fdrc->groups = gm;
|
||||
|
||||
@ -334,12 +294,8 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
|
||||
const EngineDescription &eng,
|
||||
map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
|
||||
bool make_small) {
|
||||
bool makeConfirm = true;
|
||||
unique_ptr<TeddyEngineDescription> teddyDescr =
|
||||
getTeddyDescription(eng.getID());
|
||||
if (teddyDescr) {
|
||||
makeConfirm = teddyDescr->needConfirm(lits);
|
||||
}
|
||||
|
||||
BC2CONF bc2Conf;
|
||||
u32 totalConfirmSize = 0;
|
||||
@ -351,7 +307,7 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
|
||||
auto fc = getFDRConfirm(vl, make_small, makeConfirm);
|
||||
auto fc = getFDRConfirm(vl, make_small);
|
||||
totalConfirmSize += fc.size();
|
||||
bc2Conf.emplace(b, move(fc));
|
||||
}
|
||||
|
@ -43,11 +43,12 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
|
||||
size_t i, hwlmcb_rv_t *control, u32 *last_match,
|
||||
u64a conf_key) {
|
||||
assert(i < a->len);
|
||||
assert(i >= a->start_offset);
|
||||
assert(ISALIGNED(fdrc));
|
||||
|
||||
const u8 * buf = a->buf;
|
||||
u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
|
||||
fdrc->nBitsOrSoleID);
|
||||
fdrc->nBits);
|
||||
u32 start = getConfirmLitIndex(fdrc)[c];
|
||||
if (likely(!start)) {
|
||||
return;
|
||||
@ -94,80 +95,4 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
|
||||
} while (oldNext);
|
||||
}
|
||||
|
||||
// 'light-weight' confirmation function which is used by 1-mask Teddy;
|
||||
// in the 'confirmless' case it simply calls callback function,
|
||||
// otherwise it calls 'confWithBit' function for the full confirmation procedure
|
||||
static really_inline
|
||||
void confWithBit1(const struct FDRConfirm *fdrc,
|
||||
const struct FDR_Runtime_Args *a, size_t i,
|
||||
hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
|
||||
assert(i < a->len);
|
||||
assert(ISALIGNED(fdrc));
|
||||
|
||||
if (unlikely(fdrc->mult)) {
|
||||
confWithBit(fdrc, a, i, control, last_match, conf_key);
|
||||
return;
|
||||
} else {
|
||||
u32 id = fdrc->nBitsOrSoleID;
|
||||
|
||||
if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) {
|
||||
return;
|
||||
}
|
||||
*last_match = id;
|
||||
*control = a->cb(i, i, id, a->ctxt);
|
||||
}
|
||||
}
|
||||
|
||||
// This is 'light-weight' confirmation function which is used by 2-3-4-mask Teddy
|
||||
// In the 'confirmless' case it makes fast 32-bit comparison,
|
||||
// otherwise it calls 'confWithBit' function for the full confirmation procedure
|
||||
static really_inline
|
||||
void confWithBitMany(const struct FDRConfirm *fdrc,
|
||||
const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
|
||||
hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
|
||||
assert(i < a->len);
|
||||
assert(ISALIGNED(fdrc));
|
||||
|
||||
if (i < a->start_offset) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (unlikely(fdrc->mult)) {
|
||||
confWithBit(fdrc, a, i, control, last_match, conf_key);
|
||||
return;
|
||||
} else {
|
||||
const u32 id = fdrc->nBitsOrSoleID;
|
||||
const u32 len = fdrc->soleLitSize;
|
||||
|
||||
if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (r == VECTORING && len > i - a->start_offset) {
|
||||
if (len > i + a->len_history) {
|
||||
return;
|
||||
}
|
||||
|
||||
u32 cmp = (u32)a->buf[i] << 24;
|
||||
|
||||
if (len <= i) {
|
||||
for (u32 j = 1; j <= len; j++) {
|
||||
cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
|
||||
}
|
||||
} else {
|
||||
for (u32 j = 1; j <= i; j++) {
|
||||
cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
|
||||
}
|
||||
cmp |= (u32)(a->histBytes >> (40 + i * 8));
|
||||
}
|
||||
|
||||
if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
*last_match = id;
|
||||
*control = a->cb(i - len, i, id, a->ctxt);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
955
src/fdr/teddy.c
955
src/fdr/teddy.c
File diff suppressed because it is too large
Load Diff
@ -40,10 +40,79 @@
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
|
||||
const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
|
||||
};
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(isnonzero256(var))) { \
|
||||
if (unlikely(diff256(var, ones256()))) { \
|
||||
m256 swap = swap128in256(var); \
|
||||
m256 r = interleave256lo(var, swap); \
|
||||
u64a part1 = extractlow64from256(r); \
|
||||
@ -51,32 +120,36 @@ do { \
|
||||
r = interleave256hi(var, swap); \
|
||||
u64a part3 = extractlow64from256(r); \
|
||||
u64a part4 = extract64from256(r, 1); \
|
||||
if (unlikely(part1)) { \
|
||||
if (unlikely(part1 != ones_u64a)) { \
|
||||
part1 = ~part1; \
|
||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part2)) { \
|
||||
if (unlikely(part2 != ones_u64a)) { \
|
||||
part2 = ~part2; \
|
||||
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part3)) { \
|
||||
if (unlikely(part3 != ones_u64a)) { \
|
||||
part3 = ~part3; \
|
||||
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part4)) { \
|
||||
if (unlikely(part4 != ones_u64a)) { \
|
||||
part4 = ~part4; \
|
||||
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
} \
|
||||
} while (0);
|
||||
} while(0)
|
||||
#else
|
||||
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(isnonzero256(var))) { \
|
||||
if (unlikely(diff256(var, ones256()))) { \
|
||||
m256 swap = swap128in256(var); \
|
||||
m256 r = interleave256lo(var, swap); \
|
||||
u32 part1 = extractlow32from256(r); \
|
||||
@ -88,56 +161,65 @@ do { \
|
||||
u32 part6 = extract32from256(r, 1); \
|
||||
u32 part7 = extract32from256(r, 2); \
|
||||
u32 part8 = extract32from256(r, 3); \
|
||||
if (unlikely(part1)) { \
|
||||
if (unlikely(part1 != ones_u32)) { \
|
||||
part1 = ~part1; \
|
||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part2)) { \
|
||||
if (unlikely(part2 != ones_u32)) { \
|
||||
part2 = ~part2; \
|
||||
conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
} \
|
||||
if (unlikely(part3)) { \
|
||||
if (unlikely(part3 != ones_u32)) { \
|
||||
part3 = ~part3; \
|
||||
conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part4)) { \
|
||||
if (unlikely(part4 != ones_u32)) { \
|
||||
part4 = ~part4; \
|
||||
conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part5)) { \
|
||||
if (unlikely(part5 != ones_u32)) { \
|
||||
part5 = ~part5; \
|
||||
conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part6)) { \
|
||||
if (unlikely(part6 != ones_u32)) { \
|
||||
part6 = ~part6; \
|
||||
conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part7)) { \
|
||||
if (unlikely(part7 != ones_u32)) { \
|
||||
part7 = ~part7; \
|
||||
conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part8)) { \
|
||||
if (unlikely(part8 != ones_u32)) { \
|
||||
part8 = ~part8; \
|
||||
conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
} \
|
||||
} while (0);
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
m128 p_mask128;
|
||||
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history,
|
||||
len_history, nMasks));
|
||||
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
|
||||
buf_history, len_history, nMasks));
|
||||
*p_mask = set2x128(p_mask128);
|
||||
return ret;
|
||||
}
|
||||
@ -147,7 +229,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
|
||||
m256 mask = set32x8(0xf);
|
||||
m256 lo = and256(val, mask);
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
return and256(pshufb_m256(maskBase[0*2], lo),
|
||||
return or256(pshufb_m256(maskBase[0 * 2], lo),
|
||||
pshufb_m256(maskBase[0 * 2 + 1], hi));
|
||||
}
|
||||
|
||||
@ -158,11 +240,11 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
|
||||
|
||||
m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo),
|
||||
m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
|
||||
pshufb_m256(maskBase[1 * 2 + 1], hi));
|
||||
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
|
||||
*old_1 = res_1;
|
||||
return and256(r, res_shifted_1);
|
||||
return or256(r, res_shifted_1);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
@ -173,11 +255,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
|
||||
|
||||
m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo),
|
||||
m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
|
||||
pshufb_m256(maskBase[2 * 2 + 1], hi));
|
||||
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
|
||||
*old_2 = res_2;
|
||||
return and256(r, res_shifted_2);
|
||||
return or256(r, res_shifted_2);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
@ -188,11 +270,11 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
|
||||
|
||||
m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo),
|
||||
m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
|
||||
pshufb_m256(maskBase[3 * 2 + 1], hi));
|
||||
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
|
||||
*old_3 = res_3;
|
||||
return and256(r, res_shifted_3);
|
||||
return or256(r, res_shifted_3);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
@ -200,486 +282,151 @@ const m256 *getMaskBase_avx2(const struct Teddy *teddy) {
|
||||
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
||||
}
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
|
||||
do { \
|
||||
} while(0)
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \
|
||||
m256 res_old_1 = zeroes256();
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \
|
||||
m256 res_old_1 = zeroes256(); \
|
||||
m256 res_old_2 = zeroes256();
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \
|
||||
m256 res_old_1 = zeroes256(); \
|
||||
m256 res_old_2 = zeroes256(); \
|
||||
m256 res_old_3 = zeroes256();
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n
|
||||
|
||||
#define PREP_CONF_FAT_FN_1(mask_base, val) \
|
||||
prep_conf_fat_teddy_m1(mask_base, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN_2(mask_base, val) \
|
||||
prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN_3(mask_base, val) \
|
||||
prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN_4(mask_base, val) \
|
||||
prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN(mask_base, val, n) \
|
||||
PREP_CONF_FAT_FN_##n(mask_base, val)
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
||||
do { \
|
||||
const u8 *buf_end = a->buf + a->len; \
|
||||
const u8 *ptr = a->buf + a->start_offset; \
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START; \
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect; \
|
||||
u32 last_match = (u32)-1; \
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr; \
|
||||
const size_t iterBytes = 32; \
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
|
||||
a->buf, a->len, a->start_offset); \
|
||||
\
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy); \
|
||||
const u32 *confBase = getConfBase(teddy); \
|
||||
\
|
||||
FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
|
||||
if (ptr < mainStart) { \
|
||||
ptr = mainStart - 16; \
|
||||
m256 p_mask; \
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \
|
||||
a->buf, buf_end, \
|
||||
a->buf_history, a->len_history, \
|
||||
n_msk); \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
|
||||
r_0 = or256(r_0, p_mask); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
ptr += 16; \
|
||||
} \
|
||||
\
|
||||
if (ptr + 16 <= buf_end) { \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
ptr += 16; \
|
||||
} \
|
||||
\
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
|
||||
__builtin_prefetch(ptr + (iterBytes * 4)); \
|
||||
CHECK_FLOOD; \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
|
||||
m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \
|
||||
} \
|
||||
\
|
||||
if (ptr + 16 <= buf_end) { \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
|
||||
ptr += 16; \
|
||||
} \
|
||||
\
|
||||
assert(ptr + 16 > buf_end); \
|
||||
if (ptr < buf_end) { \
|
||||
m256 p_mask; \
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \
|
||||
a->buf_history, a->len_history, \
|
||||
n_msk); \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
|
||||
r_0 = or256(r_0, p_mask); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
} \
|
||||
\
|
||||
return HWLM_SUCCESS; \
|
||||
} while(0)
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
m256 res_old_1 = ones256();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
|
||||
load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
m256 res_old_1 = ones256();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
|
||||
load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
m256 res_old_1 = ones256();
|
||||
m256 res_old_2 = ones256();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
m256 res_old_1 = ones256();
|
||||
m256 res_old_2 = ones256();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
m256 res_old_1 = ones256();
|
||||
m256 res_old_2 = ones256();
|
||||
m256 res_old_3 = ones256();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m256 *maskBase = getMaskBase_avx2(teddy);
|
||||
const u32 *confBase = getConfBase(teddy);
|
||||
|
||||
m256 res_old_1 = ones256();
|
||||
m256 res_old_2 = ones256();
|
||||
m256 res_old_3 = ones256();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, load2x128(ptr));
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, load2x128(ptr + 16));
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m256 p_mask;
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, val_0);
|
||||
r_0 = and256(r_0, p_mask);
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
#endif // HAVE_AVX2
|
||||
|
@ -309,6 +309,169 @@ bool TeddyCompiler::pack(map<BucketIndex,
|
||||
return true;
|
||||
}
|
||||
|
||||
// this entry has all-zero mask to skip reinforcement
|
||||
#define NO_REINFORCEMENT N_CHARS
|
||||
|
||||
// this means every entry in reinforcement table
|
||||
#define ALL_CHAR_SET N_CHARS
|
||||
|
||||
// each item's reinforcement mask has REINFORCED_MSK_LEN bytes
|
||||
#define REINFORCED_MSK_LEN 8
|
||||
|
||||
static
|
||||
void initReinforcedTable(u8 *reinforcedMsk) {
|
||||
u64a *mask = (u64a *)reinforcedMsk;
|
||||
fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
|
||||
}
|
||||
|
||||
static
|
||||
void fillReinforcedMskZero(u8 *reinforcedMsk) {
|
||||
u8 *mc = reinforcedMsk + NO_REINFORCEMENT * REINFORCED_MSK_LEN;
|
||||
fill_n(mc, REINFORCED_MSK_LEN, 0x00);
|
||||
}
|
||||
|
||||
static
|
||||
void fillReinforcedMsk(u8 *reinforcedMsk, u16 c, u32 j, u8 bmsk) {
|
||||
assert(j > 0);
|
||||
if (c == ALL_CHAR_SET) {
|
||||
for (size_t i = 0; i < N_CHARS; i++) {
|
||||
u8 *mc = reinforcedMsk + i * REINFORCED_MSK_LEN;
|
||||
mc[j - 1] &= ~bmsk;
|
||||
}
|
||||
} else {
|
||||
u8 *mc = reinforcedMsk + c * REINFORCED_MSK_LEN;
|
||||
mc[j - 1] &= ~bmsk;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef TEDDY_DEBUG
|
||||
static
|
||||
void dumpReinforcedMaskTable(const u8 *msks) {
|
||||
for (u32 i = 0; i <= N_CHARS; i++) {
|
||||
printf("0x%02x: ", i);
|
||||
for (u32 j = 0; j < REINFORCED_MSK_LEN; j++) {
|
||||
u8 val = msks[i * REINFORCED_MSK_LEN + j];
|
||||
for (u32 k = 0; k < 8; k++) {
|
||||
printf("%s", ((val >> k) & 0x1) ? "1" : "0");
|
||||
}
|
||||
printf(" ");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static
|
||||
void fillNibbleMasks(const map<BucketIndex,
|
||||
vector<LiteralIndex>> &bucketToLits,
|
||||
const vector<hwlmLiteral> &lits,
|
||||
u32 numMasks, u32 maskWidth, size_t maskLen,
|
||||
u8 *baseMsk) {
|
||||
memset(baseMsk, 0xff, maskLen);
|
||||
|
||||
for (const auto &b2l : bucketToLits) {
|
||||
const u32 &bucket_id = b2l.first;
|
||||
const vector<LiteralIndex> &ids = b2l.second;
|
||||
const u8 bmsk = 1U << (bucket_id % 8);
|
||||
|
||||
for (const LiteralIndex &lit_id : ids) {
|
||||
const hwlmLiteral &l = lits[lit_id];
|
||||
DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
|
||||
const u32 sz = verify_u32(l.s.size());
|
||||
|
||||
// fill in masks
|
||||
for (u32 j = 0; j < numMasks; j++) {
|
||||
const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
|
||||
const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
|
||||
const u32 lo_base = msk_id_lo * 16;
|
||||
const u32 hi_base = msk_id_hi * 16;
|
||||
|
||||
// if we don't have a char at this position, fill in i
|
||||
// locations in these masks with '1'
|
||||
if (j >= sz) {
|
||||
for (u32 n = 0; n < 16; n++) {
|
||||
baseMsk[lo_base + n] &= ~bmsk;
|
||||
baseMsk[hi_base + n] &= ~bmsk;
|
||||
}
|
||||
} else {
|
||||
u8 c = l.s[sz - 1 - j];
|
||||
// if we do have a char at this position
|
||||
const u32 hiShift = 4;
|
||||
u32 n_hi = (c >> hiShift) & 0xf;
|
||||
u32 n_lo = c & 0xf;
|
||||
|
||||
if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
|
||||
u8 m = l.msk[l.msk.size() - 1 - j];
|
||||
u8 m_hi = (m >> hiShift) & 0xf;
|
||||
u8 m_lo = m & 0xf;
|
||||
u8 cmp = l.cmp[l.msk.size() - 1 - j];
|
||||
u8 cmp_lo = cmp & 0xf;
|
||||
u8 cmp_hi = (cmp >> hiShift) & 0xf;
|
||||
|
||||
for (u8 cm = 0; cm < 0x10; cm++) {
|
||||
if ((cm & m_lo) == (cmp_lo & m_lo)) {
|
||||
baseMsk[lo_base + cm] &= ~bmsk;
|
||||
}
|
||||
if ((cm & m_hi) == (cmp_hi & m_hi)) {
|
||||
baseMsk[hi_base + cm] &= ~bmsk;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (l.nocase && ourisalpha(c)) {
|
||||
u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
|
||||
u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
|
||||
baseMsk[hi_base + (n_hi & cmHalfClear)] &= ~bmsk;
|
||||
baseMsk[hi_base + (n_hi | cmHalfSet)] &= ~bmsk;
|
||||
} else {
|
||||
baseMsk[hi_base + n_hi] &= ~bmsk;
|
||||
}
|
||||
baseMsk[lo_base + n_lo] &= ~bmsk;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void fillReinforcedTable(const map<BucketIndex,
|
||||
vector<LiteralIndex>> &bucketToLits,
|
||||
const vector<hwlmLiteral> &lits,
|
||||
u8 *reinforcedMsk) {
|
||||
initReinforcedTable(reinforcedMsk);
|
||||
|
||||
for (const auto &b2l : bucketToLits) {
|
||||
const u32 &bucket_id = b2l.first;
|
||||
const vector<LiteralIndex> &ids = b2l.second;
|
||||
const u8 bmsk = 1U << (bucket_id % 8);
|
||||
|
||||
for (const LiteralIndex &lit_id : ids) {
|
||||
const hwlmLiteral &l = lits[lit_id];
|
||||
DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
|
||||
const u32 sz = verify_u32(l.s.size());
|
||||
|
||||
// fill in reinforced masks
|
||||
for (u32 j = 1; j < REINFORCED_MSK_LEN; j++) {
|
||||
if (sz - 1 < j) {
|
||||
fillReinforcedMsk(reinforcedMsk, ALL_CHAR_SET, j, bmsk);
|
||||
} else {
|
||||
u8 c = l.s[sz - 1 - j];
|
||||
if (l.nocase && ourisalpha(c)) {
|
||||
u8 c_up = c & 0xdf;
|
||||
fillReinforcedMsk(reinforcedMsk, c_up, j, bmsk);
|
||||
u8 c_lo = c | 0x20;
|
||||
fillReinforcedMsk(reinforcedMsk, c_lo, j, bmsk);
|
||||
} else {
|
||||
fillReinforcedMsk(reinforcedMsk, c, j, bmsk);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fillReinforcedMskZero(reinforcedMsk);
|
||||
}
|
||||
|
||||
bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
assert(eng.numMasks <= MAX_NUM_MASKS);
|
||||
|
||||
@ -329,27 +492,23 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
#endif
|
||||
|
||||
map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
|
||||
if (eng.needConfirm(lits)) {
|
||||
if (!pack(bucketToLits)) {
|
||||
DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
|
||||
lits.size(), eng.getNumBuckets());
|
||||
return nullptr;
|
||||
}
|
||||
} else {
|
||||
for (u32 i = 0; i < lits.size(); i++) {
|
||||
bucketToLits[i].push_back(i);
|
||||
}
|
||||
}
|
||||
u32 maskWidth = eng.getNumBuckets() / 8;
|
||||
|
||||
size_t headerSize = sizeof(Teddy);
|
||||
size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
|
||||
size_t reinforcedMaskLen = (N_CHARS + 1) * REINFORCED_MSK_LEN;
|
||||
|
||||
auto floodTable = setupFDRFloodControl(lits, eng, grey);
|
||||
auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
|
||||
|
||||
// Note: we place each major structure here on a cacheline boundary.
|
||||
size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
|
||||
ROUNDUP_CL(reinforcedMaskLen) +
|
||||
ROUNDUP_CL(confirmTable.size()) + floodTable.size();
|
||||
|
||||
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
|
||||
@ -363,7 +522,8 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
teddy->maxStringLen = verify_u32(maxLen(lits));
|
||||
|
||||
// Write confirm structures.
|
||||
u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen);
|
||||
u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
|
||||
ROUNDUP_CL(reinforcedMaskLen);
|
||||
assert(ISALIGNED_CL(ptr));
|
||||
teddy->confOffset = verify_u32(ptr - teddy_base);
|
||||
memcpy(ptr, confirmTable.get(), confirmTable.size());
|
||||
@ -377,69 +537,12 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
|
||||
// Write teddy masks.
|
||||
u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
|
||||
fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen,
|
||||
baseMsk);
|
||||
|
||||
for (const auto &b2l : bucketToLits) {
|
||||
const u32 &bucket_id = b2l.first;
|
||||
const vector<LiteralIndex> &ids = b2l.second;
|
||||
const u8 bmsk = 1U << (bucket_id % 8);
|
||||
|
||||
for (const LiteralIndex &lit_id : ids) {
|
||||
const hwlmLiteral &l = lits[lit_id];
|
||||
DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
|
||||
const u32 sz = verify_u32(l.s.size());
|
||||
|
||||
// fill in masks
|
||||
for (u32 j = 0; j < eng.numMasks; j++) {
|
||||
const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
|
||||
const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
|
||||
const u32 lo_base = msk_id_lo * 16;
|
||||
const u32 hi_base = msk_id_hi * 16;
|
||||
|
||||
// if we don't have a char at this position, fill in i
|
||||
// locations in these masks with '1'
|
||||
if (j >= sz) {
|
||||
for (u32 n = 0; n < 16; n++) {
|
||||
baseMsk[lo_base + n] |= bmsk;
|
||||
baseMsk[hi_base + n] |= bmsk;
|
||||
}
|
||||
} else {
|
||||
u8 c = l.s[sz - 1 - j];
|
||||
// if we do have a char at this position
|
||||
const u32 hiShift = 4;
|
||||
u32 n_hi = (c >> hiShift) & 0xf;
|
||||
u32 n_lo = c & 0xf;
|
||||
|
||||
if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
|
||||
u8 m = l.msk[l.msk.size() - 1 - j];
|
||||
u8 m_hi = (m >> hiShift) & 0xf;
|
||||
u8 m_lo = m & 0xf;
|
||||
u8 cmp = l.cmp[l.msk.size() - 1 - j];
|
||||
u8 cmp_lo = cmp & 0xf;
|
||||
u8 cmp_hi = (cmp >> hiShift) & 0xf;
|
||||
|
||||
for (u8 cm = 0; cm < 0x10; cm++) {
|
||||
if ((cm & m_lo) == (cmp_lo & m_lo)) {
|
||||
baseMsk[lo_base + cm] |= bmsk;
|
||||
}
|
||||
if ((cm & m_hi) == (cmp_hi & m_hi)) {
|
||||
baseMsk[hi_base + cm] |= bmsk;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (l.nocase && ourisalpha(c)) {
|
||||
u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
|
||||
u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
|
||||
baseMsk[hi_base + (n_hi & cmHalfClear)] |= bmsk;
|
||||
baseMsk[hi_base + (n_hi | cmHalfSet)] |= bmsk;
|
||||
} else {
|
||||
baseMsk[hi_base + n_hi] |= bmsk;
|
||||
}
|
||||
baseMsk[lo_base + n_lo] |= bmsk;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Write reinforcement masks.
|
||||
u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
|
||||
fillReinforcedTable(bucketToLits, lits, reinforcedMsk);
|
||||
|
||||
#ifdef TEDDY_DEBUG
|
||||
for (u32 i = 0; i < eng.numMasks * 2; i++) {
|
||||
@ -452,6 +555,10 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("\n===============================================\n"
|
||||
"reinforced mask table for low boundary (original)\n\n");
|
||||
dumpReinforcedMaskTable(reinforcedMsk);
|
||||
#endif
|
||||
|
||||
return fdr;
|
||||
|
@ -51,18 +51,6 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
|
||||
return numMasks;
|
||||
}
|
||||
|
||||
bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const {
|
||||
if (packed || lits.size() > getNumBuckets()) {
|
||||
return true;
|
||||
}
|
||||
for (const auto &lit : lits) {
|
||||
if (lit.s.size() > numMasks || !lit.msk.empty()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
|
||||
static const TeddyEngineDef defns[] = {
|
||||
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
|
||||
|
@ -55,7 +55,6 @@ public:
|
||||
explicit TeddyEngineDescription(const TeddyEngineDef &def);
|
||||
|
||||
u32 getDefaultFloodSuffixLength() const override;
|
||||
bool needConfirm(const std::vector<hwlmLiteral> &lits) const;
|
||||
};
|
||||
|
||||
std::unique_ptr<TeddyEngineDescription>
|
||||
|
@ -26,6 +26,25 @@
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* Teddy bytecode layout:
|
||||
* * |-----|
|
||||
* * | | struct Teddy
|
||||
* * |-----|
|
||||
* * | | teddy masks
|
||||
* * | |
|
||||
* * |-----|
|
||||
* * | | reinforcement mask table
|
||||
* * | |
|
||||
* * |-----|
|
||||
* * | | confirm
|
||||
* * | |
|
||||
* * | |
|
||||
* * |-----|
|
||||
* * | | flood control
|
||||
* * | |
|
||||
* * |-----|
|
||||
*/
|
||||
|
||||
#ifndef TEDDY_INTERNAL_H
|
||||
#define TEDDY_INTERNAL_H
|
||||
|
||||
|
@ -38,8 +38,12 @@
|
||||
#include "ue2common.h"
|
||||
#include "util/bitutils.h"
|
||||
#include "util/simd_utils.h"
|
||||
#include "util/uniform_ops.h"
|
||||
|
||||
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
|
||||
#if defined(__AVX2__)
|
||||
extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
#define TEDDY_CONF_TYPE u64a
|
||||
@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
|
||||
}
|
||||
|
||||
// Note: p_mask is an output param that initialises a poison mask.
|
||||
// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
|
||||
// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
|
||||
// then followed by 0xff in all remaining bytes.
|
||||
// ptr >= lo:
|
||||
// no history.
|
||||
// for end/short zone, ptr==lo and start_offset==0
|
||||
// for start zone, see below
|
||||
// lo ptr hi hi
|
||||
// |----------|-------|----------------|............|
|
||||
// start 0 start+offset end(<=16)
|
||||
// p_mask ffff..ff0000...........00ffff..........
|
||||
// ptr < lo:
|
||||
// only start zone.
|
||||
// history
|
||||
// ptr lo hi hi
|
||||
// |----------|-------|----------------|............|
|
||||
// 0 start start+offset end(<=16)
|
||||
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
||||
static really_inline
|
||||
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
union {
|
||||
@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
uintptr_t copy_start;
|
||||
uintptr_t copy_len;
|
||||
|
||||
if (ptr >= lo) {
|
||||
if (ptr >= lo) { // short/end/start zone
|
||||
uintptr_t start = (uintptr_t)(ptr - lo);
|
||||
uintptr_t avail = (uintptr_t)(hi - ptr);
|
||||
if (avail >= 16) {
|
||||
*p_mask = load128(p_mask_arr[16] + 16);
|
||||
assert(start_offset - start <= 16);
|
||||
*p_mask = loadu128(p_mask_arr[16 - start_offset + start]
|
||||
+ 16 - start_offset + start);
|
||||
return loadu128(ptr);
|
||||
}
|
||||
*p_mask = load128(p_mask_arr[avail] + 16);
|
||||
assert(start_offset - start <= avail);
|
||||
*p_mask = loadu128(p_mask_arr[avail - start_offset + start]
|
||||
+ 16 - start_offset + start);
|
||||
copy_start = 0;
|
||||
copy_len = avail;
|
||||
} else {
|
||||
} else { // start zone
|
||||
uintptr_t need = MIN((uintptr_t)(lo - ptr),
|
||||
MIN(len_history, nMasks - 1));
|
||||
uintptr_t start = (uintptr_t)(lo - ptr);
|
||||
uintptr_t i;
|
||||
for (i = start - need; ptr + i < lo; i++) {
|
||||
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
|
||||
for (i = start - need; i < start; i++) {
|
||||
u.val8[i] = buf_history[len_history - (start - i)];
|
||||
}
|
||||
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
|
||||
*p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
|
||||
copy_start = i;
|
||||
copy_len = end - i;
|
||||
assert(start + start_offset <= end);
|
||||
*p_mask = loadu128(p_mask_arr[end - start - start_offset]
|
||||
+ 16 - start - start_offset);
|
||||
copy_start = start;
|
||||
copy_len = end - start;
|
||||
}
|
||||
|
||||
// Runt block from the buffer.
|
||||
@ -152,6 +182,135 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
return u.val128;
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
/**
 * \brief Copy a short block (0 to 31 bytes) without calling memcpy.
 *
 * Some compilers turn a small variable-length byte copy, such as the one done
 * in vectoredLoad256, into a call to memcpy. Spelling the copy out as
 * fixed-width unaligned loads and stores, chosen by length, works around that.
 *
 * \param dst destination buffer; exactly \p len bytes are written.
 * \param src source buffer; exactly \p len bytes are read.
 * \param len number of bytes to copy; asserted to be less than 32.
 */
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
    if (!len) {
        return; // nothing to copy
    }

    if (len == 1) {
        dst[0] = src[0];
        return;
    }

    if (len < 4) {
        // 2 or 3 bytes: one 16-bit move, plus the trailing byte for len == 3.
        unaligned_store_u16(dst, unaligned_load_u16(src));
        if (len == 3) {
            dst[2] = src[2];
        }
        return;
    }

    if (len == 4) {
        unaligned_store_u32(dst, unaligned_load_u32(src));
        return;
    }

    if (len < 8) {
        // 5..7 bytes: two overlapping 4-byte chunks (tail first, then head).
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        return;
    }

    if (len == 8) {
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        return;
    }

    if (len < 16) {
        // 9..15 bytes: two overlapping 8-byte chunks (tail first, then head).
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        return;
    }

    if (len == 16) {
        storeu128(dst, loadu128(src));
        return;
    }

    // 17..31 bytes: two overlapping 16-byte chunks (tail first, then head).
    assert(len < 32);
    storeu128(dst + len - 16, loadu128(src + len - 16));
    storeu128(dst, loadu128(src));
}
|
||||
|
||||
// Note: p_mask is an output param that initialises a poison mask.
|
||||
// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
|
||||
// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
|
||||
// and then 0xff for all of the remaining bytes.
|
||||
// ptr >= lo:
|
||||
// no history.
|
||||
// for end/short zone, ptr==lo and start_offset==0
|
||||
// for start zone, see below
|
||||
// lo ptr hi hi
|
||||
// |----------|-------|----------------|............|
|
||||
// start 0 start+offset end(<=32)
|
||||
// p_mask ffff..ff0000...........00ffff..........
|
||||
// ptr < lo:
|
||||
// only start zone.
|
||||
// history
|
||||
// ptr lo hi hi
|
||||
// |----------|-------|----------------|............|
|
||||
// 0 start start+offset end(<=32)
|
||||
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
||||
/*
 * Build a 32-byte vector for the position ptr and the matching poison mask.
 *
 * The returned vector starts out as all zeroes and is then filled with:
 *  - up to MIN(lo - ptr, len_history, nMasks - 1) bytes taken from the end of
 *    buf_history, when ptr lies before lo;
 *  - the bytes of the main buffer from max(ptr, lo) up to hi, limited to the
 *    32-byte window that begins at ptr.
 *
 * *p_mask is read from p_mask_arr256: p_mask_arr256[n] + 32 - m is a mask of
 * m leading 0xff bytes, then n 0x00 bytes, then 0xff for the rest. The 0x00
 * run covers the window bytes from (lo + start_offset) up to hi.
 * NOTE(review): presumably callers treat 0xff mask bytes as "ignore this
 * position" -- confirm against the users of p_mask.
 */
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[32];
        m256 val256;
    } block;
    block.val256 = zeroes256();

    uintptr_t from;  // first window byte to take from the main buffer
    uintptr_t count; // number of main-buffer bytes to take

    if (ptr < lo) {
        // Start zone: the window begins before the buffer.
        const uintptr_t gap = (uintptr_t)(lo - ptr);
        const uintptr_t wanted = MIN(gap, MIN(len_history, nMasks - 1));

        // Place the last `wanted` history bytes immediately in front of lo.
        uintptr_t pos = gap - wanted;
        while (pos < gap) {
            block.val8[pos] = buf_history[len_history - (gap - pos)];
            pos++;
        }

        const uintptr_t limit = MIN(32, (uintptr_t)(hi - ptr));
        assert(gap + start_offset <= limit);
        *p_mask = loadu256(p_mask_arr256[limit - gap - start_offset]
                           + 32 - gap - start_offset);
        from = gap;
        count = limit - gap;
    } else {
        // Short/end/start zone: the window begins inside the buffer.
        const uintptr_t past_lo = (uintptr_t)(ptr - lo);
        const uintptr_t remaining = (uintptr_t)(hi - ptr);
        // Number of leading 0xff bytes in the mask.
        const uintptr_t lead = start_offset - past_lo;

        if (remaining >= 32) {
            // A full vector is available: load it straight from the buffer.
            assert(lead <= 32);
            *p_mask = loadu256(p_mask_arr256[32 - lead] + 32 - lead);
            return loadu256(ptr);
        }

        assert(lead <= remaining);
        *p_mask = loadu256(p_mask_arr256[remaining - lead] + 32 - lead);
        from = 0;
        count = remaining;
    }

    // Runt block from the buffer.
    copyRuntBlock256(&block.val8[from], &ptr[from], count);

    return block.val256;
}
|
||||
#endif // __AVX2__
|
||||
|
||||
static really_inline
|
||||
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
|
||||
CautionReason reason) {
|
||||
@ -196,53 +355,17 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
||||
} while (unlikely(*conf));
|
||||
}
|
||||
|
||||
/*
 * Process every candidate bit held in *conf through confWithBit1.
 *
 * Each bit index taken from *conf is split into a byte position
 * (bit / bucket + offset) and a bucket index (bit % bucket). confBase[index]
 * is a byte offset from confBase to that bucket's FDRConfirm structure.
 * Buckets whose groups share no bit with *control are skipped.
 *
 * The loop body executes at least once and repeats while *conf is non-zero.
 * NOTE(review): TEDDY_FIND_AND_CLEAR_LSB is presumed to clear the bit it
 * returns, and callers are presumed to pass a non-zero *conf -- confirm.
 */
static really_inline
void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                           const u32 *confBase, CautionReason reason,
                           const struct FDR_Runtime_Args *a, const u8 *ptr,
                           hwlmcb_rv_t *control, u32 *last_match) {
    do {
        const u32 pos = TEDDY_FIND_AND_CLEAR_LSB(conf);
        const u32 slot = pos % bucket;
        const struct FDRConfirm *fdrc =
            (const struct FDRConfirm *)((const u8 *)confBase + confBase[slot]);

        // Only confirm when at least one of this bucket's groups is enabled.
        if (fdrc->groups & *control) {
            const u32 byte = pos / bucket + offset;
            const u64a confVal = getConfVal(a, ptr, byte, reason);
            confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
                         confVal);
        }
    } while (unlikely(*conf));
}
|
||||
|
||||
/*
 * Process every candidate bit held in *conf through confWithBitMany.
 *
 * Each bit index taken from *conf is split into a byte position
 * (bit / bucket + offset) and a bucket index (bit % bucket). confBase[index]
 * is a byte offset from confBase to that bucket's FDRConfirm structure.
 * Buckets whose groups share no bit with *control are skipped.
 *
 * The loop body executes at least once and repeats while *conf is non-zero.
 * NOTE(review): TEDDY_FIND_AND_CLEAR_LSB is presumed to clear the bit it
 * returns, and callers are presumed to pass a non-zero *conf -- confirm.
 */
static really_inline
void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                              const u32 *confBase, CautionReason reason,
                              const struct FDR_Runtime_Args *a, const u8 *ptr,
                              hwlmcb_rv_t *control, u32 *last_match) {
    do {
        const u32 pos = TEDDY_FIND_AND_CLEAR_LSB(conf);
        const u32 slot = pos % bucket;
        const struct FDRConfirm *fdrc =
            (const struct FDRConfirm *)((const u8 *)confBase + confBase[slot]);

        // Only confirm when at least one of this bucket's groups is enabled.
        if (fdrc->groups & *control) {
            const u32 byte = pos / bucket + offset;
            const u64a confVal = getConfVal(a, ptr, byte, reason);
            confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
                            last_match, confVal);
        }
    } while (unlikely(*conf));
}
|
||||
|
||||
/*
 * Return the address of the Teddy masks: they begin ROUNDUP_CL(sizeof(struct
 * Teddy)) bytes past the start of the Teddy structure.
 */
static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) {
    const u8 *base = (const u8 *)teddy;
    return (const m128 *)(base + ROUNDUP_CL(sizeof(struct Teddy)));
}
|
||||
|
||||
/*
 * Return the address of the reinforcement mask table: it begins
 * ROUNDUP_CL(2 * numMask * sizeof(m128)) bytes past the address returned by
 * getMaskBase(teddy).
 */
static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
    const u8 *masks = (const u8 *)getMaskBase(teddy);
    return (const u64a *)(masks + ROUNDUP_CL(2 * numMask * sizeof(m128)));
}
|
||||
|
||||
static really_inline
|
||||
const u32 *getConfBase(const struct Teddy *teddy) {
|
||||
return (const u32 *)((const u8 *)teddy + teddy->confOffset);
|
||||
|
Loading…
x
Reference in New Issue
Block a user