diff --git a/src/fdr/fdr_confirm.h b/src/fdr/fdr_confirm.h index e160b96d..d975747e 100644 --- a/src/fdr/fdr_confirm.h +++ b/src/fdr/fdr_confirm.h @@ -78,12 +78,8 @@ struct LitInfo { struct FDRConfirm { CONF_TYPE andmsk; CONF_TYPE mult; - u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID - u32 flags; // sole meaning is 'non-zero means no-confirm' (that is all) + u32 nBits; hwlm_group_t groups; - u32 soleLitSize; - u32 soleLitCmp; - u32 soleLitMsk; }; static really_inline diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 616ff86e..a6eee4cf 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -130,7 +130,7 @@ void fillLitInfo(const vector &lits, vector &tmpLitInfo, static bytecode_ptr getFDRConfirm(const vector &lits, - bool make_small, bool make_confirm) { + bool make_small) { // Every literal must fit within CONF_TYPE. assert(all_of_in(lits, [](const hwlmLiteral &lit) { return lit.s.size() <= sizeof(CONF_TYPE); @@ -153,42 +153,6 @@ bytecode_ptr getFDRConfirm(const vector &lits, } CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL; - u32 flags = 0; - // we use next three variables for 'confirmless' case to speed-up - // confirmation process - u32 soleLitSize = 0; - u32 soleLitCmp = 0; - u32 soleLitMsk = 0; - - if (!make_confirm) { - flags = FDRC_FLAG_NO_CONFIRM; - if (lits[0].noruns) { - // messy - need to clean this up later as flags is sorta kinda - // obsoleted - flags |= FDRC_FLAG_NOREPEAT; - } - mult = 0; - soleLitSize = lits[0].s.size() - 1; - // we can get to this point only in confirmless case; - // it means that we have only one literal per FDRConfirm (no packing), - // with no literal mask and size of literal is less or equal - // to the number of masks of Teddy engine; - // maximum number of masks for Teddy is 4, so the size of - // literal is definitely less or equal to size of u32 - assert(lits[0].s.size() <= sizeof(u32)); - for (u32 i = 0; i < lits[0].s.size(); i++) { - u32 shiftLoc = (sizeof(u32) - i - 1) * 8; - u8 c = lits[0].s[lits[0].s.size() - i - 1]; - if (lits[0].nocase && ourisalpha(c)) { - soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc; - soleLitMsk |= (u32)CASE_CLEAR << shiftLoc; - } - else { - soleLitCmp |= (u32)c << shiftLoc; - soleLitMsk |= (u32)0xff << shiftLoc; - } - } - } // we can walk the vector and assign elements from the vectors to a // map by hash value @@ -276,11 +240,7 @@ bytecode_ptr getFDRConfirm(const vector &lits, fdrc->andmsk = andmsk; fdrc->mult = mult; - fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits; - fdrc->flags = flags; - fdrc->soleLitSize = soleLitSize; - fdrc->soleLitCmp = soleLitCmp; - fdrc->soleLitMsk = soleLitMsk; + fdrc->nBits = nBits; fdrc->groups = gm; @@ -334,12 +294,8 @@ setupFullConfs(const vector &lits, const EngineDescription &eng, map> &bucketToLits, bool make_small) { - bool makeConfirm = true; unique_ptr teddyDescr = getTeddyDescription(eng.getID()); - if (teddyDescr) { - makeConfirm = teddyDescr->needConfirm(lits); - } BC2CONF bc2Conf; u32 totalConfirmSize = 0; @@ -351,7 +307,7 @@ setupFullConfs(const vector &lits, } DEBUG_PRINTF("b %d sz %zu\n", b, vl.size()); - auto fc = getFDRConfirm(vl, make_small, makeConfirm); + auto fc = getFDRConfirm(vl, make_small); totalConfirmSize += fc.size(); bc2Conf.emplace(b, move(fc)); } diff --git a/src/fdr/fdr_confirm_runtime.h b/src/fdr/fdr_confirm_runtime.h index ea644bfb..d75408f4 100644 --- a/src/fdr/fdr_confirm_runtime.h +++ b/src/fdr/fdr_confirm_runtime.h @@ -43,11 +43,12 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a size_t i, hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) { assert(i < a->len); + assert(i >= a->start_offset); assert(ISALIGNED(fdrc)); const u8 * buf = a->buf; u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult, - fdrc->nBitsOrSoleID); + fdrc->nBits); u32 start = getConfirmLitIndex(fdrc)[c]; if (likely(!start)) { return; @@ -94,80 +95,4 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a } while (oldNext); } -// 'light-weight' confirmation function which is used by 1-mask Teddy; -// in the 'confirmless' case it simply calls callback function, -// otherwise it calls 'confWithBit' function for the full confirmation procedure -static really_inline -void confWithBit1(const struct FDRConfirm *fdrc, - const struct FDR_Runtime_Args *a, size_t i, - hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) { - assert(i < a->len); - assert(ISALIGNED(fdrc)); - - if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, control, last_match, conf_key); - return; - } else { - u32 id = fdrc->nBitsOrSoleID; - - if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) { - return; - } - *last_match = id; - *control = a->cb(i, i, id, a->ctxt); - } -} - -// This is 'light-weight' confirmation function which is used by 2-3-4-mask Teddy -// In the 'confirmless' case it makes fast 32-bit comparison, -// otherwise it calls 'confWithBit' function for the full confirmation procedure -static really_inline -void confWithBitMany(const struct FDRConfirm *fdrc, - const struct FDR_Runtime_Args *a, size_t i, CautionReason r, - hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) { - assert(i < a->len); - assert(ISALIGNED(fdrc)); - - if (i < a->start_offset) { - return; - } - - if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, control, last_match, conf_key); - return; - } else { - const u32 id = fdrc->nBitsOrSoleID; - const u32 len = fdrc->soleLitSize; - - if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) { - return; - } - - if (r == VECTORING && len > i - a->start_offset) { - if (len > i + a->len_history) { - return; - } - - u32 cmp = (u32)a->buf[i] << 24; - - if (len <= i) { - for (u32 j = 1; j <= len; j++) { - cmp |= (u32)a->buf[i - j] << (24 - (j * 8)); - } - } else { - for (u32 j = 1; j <= i; j++) { - cmp |= (u32)a->buf[i - j] << (24 - (j * 8)); - } - cmp |= (u32)(a->histBytes >> (40 + i * 8)); - } - - if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) { - return; - } - } - *last_match = id; - *control = a->cb(i - len, i, id, a->ctxt); - } -} - #endif diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 636c741b..da5096a0 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -38,90 +38,294 @@ #include "util/simd_utils.h" const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; +#if defined(__AVX2__) // reinforced teddy + #ifdef ARCH_64_BIT #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero128(var))) { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u64a part1 = movq(lo); \ + u64a part2 = movq(rshiftbyte_m128(lo, 8)); \ + u64a part3 = movq(hi); \ + u64a part4 = movq(rshiftbyte_m128(hi, 8)); \ + if (unlikely(part1 != ones_u64a)) { \ + part1 = ~part1; \ + conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part2 != ones_u64a)) { \ + part2 = ~part2; \ + conf_fn(&part2, bucket, offset + 8, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part3 != ones_u64a)) { \ + part3 = ~part3; \ + conf_fn(&part3, bucket, offset + 16, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part4 != ones_u64a)) { \ + part4 = ~part4; \ + conf_fn(&part4, bucket, offset + 24, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u32 part1 = movd(lo); \ + u32 part2 = movd(rshiftbyte_m128(lo, 4)); \ + u32 part3 = movd(rshiftbyte_m128(lo, 8)); \ + u32 part4 = movd(rshiftbyte_m128(lo, 12)); \ + u32 part5 = movd(hi); \ + u32 part6 = movd(rshiftbyte_m128(hi, 4)); \ + u32 part7 = movd(rshiftbyte_m128(hi, 8)); \ + u32 part8 = movd(rshiftbyte_m128(hi, 12)); \ + if (unlikely(part1 != ones_u32)) { \ + part1 = ~part1; \ + conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part2 != ones_u32)) { \ + part2 = ~part2; \ + conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part3 != ones_u32)) { \ + part3 = ~part3; \ + conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part4 != ones_u32)) { \ + part4 = ~part4; \ + conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part5 != ones_u32)) { \ + part5 = ~part5; \ + conf_fn(&part5, bucket, offset + 16, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part6 != ones_u32)) { \ + part6 = ~part6; \ + conf_fn(&part6, bucket, offset + 20, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part7 != ones_u32)) { \ + part7 = ~part7; \ + conf_fn(&part7, bucket, offset + 24, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part8 != ones_u32)) { \ + part8 = ~part8; \ + conf_fn(&part8, bucket, offset + 28, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \ + m256 lo = and256(val, *lo_mask); \ + m256 hi = and256(rshift64_m256(val, 4), *lo_mask) + +#define PREP_SHUF_MASK \ + PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ + *c_128 = *(ptr + 15); \ + m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ + *c_0 = *(ptr + 31) + +#define SHIFT_OR_M1 \ + or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi)) + +#define SHIFT_OR_M2 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo), \ + pshufb_m256(dup_mask[3], hi)), \ + 1), SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo), \ + pshufb_m256(dup_mask[5], hi)), \ + 2), SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo), \ + pshufb_m256(dup_mask[7], hi)), \ + 3), SHIFT_OR_M3) + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M1; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M2; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M3; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M4; +} + +static really_inline +m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M1, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M2, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M3, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M4, r_msk); +} + +#else // not defined __AVX2__ + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff128(var, ones128()))) { \ u64a lo = movq(var); \ u64a hi = movq(rshiftbyte_m128(var, 8)); \ - if (unlikely(lo)) { \ + if (unlikely(lo != ones_u64a)) { \ + lo = ~lo; \ conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(hi)) { \ + if (unlikely(hi != ones_u64a)) { \ + hi = ~hi; \ conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ -} while (0); +} while(0) #else #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero128(var))) { \ + if (unlikely(diff128(var, ones128()))) { \ u32 part1 = movd(var); \ u32 part2 = movd(rshiftbyte_m128(var, 4)); \ u32 part3 = movd(rshiftbyte_m128(var, 8)); \ u32 part4 = movd(rshiftbyte_m128(var, 12)); \ - if (unlikely(part1)) { \ + if (unlikely(part1 != ones_u32)) { \ + part1 = ~part1; \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part2)) { \ + if (unlikely(part2 != ones_u32)) { \ + part2 = ~part2; \ conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part3)) { \ + if (unlikely(part3 != ones_u32)) { \ + part3 = ~part3; \ conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part4)) { \ + if (unlikely(part4 != ones_u32)) { \ + part4 = ~part4; \ conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ -} while (0); +} while(0) #endif static really_inline @@ -129,8 +333,8 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); - return and128(pshufb_m128(maskBase[0 * 2], lo), - pshufb_m128(maskBase[0 * 2 + 1], hi)); + return or128(pshufb_m128(maskBase[0 * 2], lo), + pshufb_m128(maskBase[0 * 2 + 1], hi)); } static really_inline @@ -140,11 +344,11 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m1(maskBase, val); - m128 res_1 = and128(pshufb_m128(maskBase[1*2], lo), - pshufb_m128(maskBase[1*2+1], hi)); - m128 res_shifted_1 = palignr(res_1, *old_1, 16-1); + m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo), + pshufb_m128(maskBase[1 * 2 + 1], hi)); + m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1); *old_1 = res_1; - return and128(r, res_shifted_1); + return or128(r, res_shifted_1); } static really_inline @@ -155,11 +359,11 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m2(maskBase, old_1, val); - m128 res_2 = and128(pshufb_m128(maskBase[2*2], lo), - pshufb_m128(maskBase[2*2+1], hi)); - m128 res_shifted_2 = palignr(res_2, *old_2, 16-2); + m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo), + pshufb_m128(maskBase[2 * 2 + 1], hi)); + m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2); *old_2 = res_2; - return and128(r, res_shifted_2); + return or128(r, res_shifted_2); } static really_inline @@ -170,487 +374,260 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); - m128 res_3 = and128(pshufb_m128(maskBase[3*2], lo), - pshufb_m128(maskBase[3*2+1], hi)); - m128 res_shifted_3 = palignr(res_3, *old_3, 16-3); + m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo), + pshufb_m128(maskBase[3 * 2 + 1], hi)); + m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3); *old_3 = res_3; - return and128(r, res_shifted_3); + return or128(r, res_shifted_3); } +#endif // __AVX2__ + +#if defined(__AVX2__) // reinforced teddy + +#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FN(ptr, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set2x128(maskBase[0]); \ + dup_mask[1] = set2x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set2x128(maskBase[2]); \ + dup_mask[3] = set2x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set2x128(maskBase[4]); \ + dup_mask[5] = set2x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set2x128(maskBase[6]); \ + dup_mask[7] = set2x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m256 lo_mask = set32x8(0xf); \ + m256 dup_mask[n * 2]; \ + PREPARE_MASKS_##n + +#else // not defined __AVX2__ + +#define FDR_EXEC_TEDDY_RES_OLD_1 + +#define FDR_EXEC_TEDDY_RES_OLD_2 \ + m128 res_old_1 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_3 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_4 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); \ + m128 res_old_3 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n + +#define PREP_CONF_FN_1(mask_base, val) \ + prep_conf_teddy_m1(mask_base, val) + +#define PREP_CONF_FN_2(mask_base, val) \ + prep_conf_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FN_3(mask_base, val) \ + prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FN_4(mask_base, val) \ + prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FN(mask_base, val, n) \ + PREP_CONF_FN_##n(mask_base, val) +#endif // __AVX2__ + + +#if defined(__AVX2__) // reinforced teddy +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = (u32)-1; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \ + u32 c_0 = 0x100; \ + u32 c_128 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 32; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + c_0 = *(ptr + 31); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk); \ + CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 32; \ + } \ + \ + assert(ptr + 32 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) +#else // not defined __AVX2__ +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = (u32)-1; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk); \ + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) +#endif // __AVX2__ + hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - m128 res_old_1 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - m128 res_old_1 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - m128 res_old_1 = ones128(); - m128 res_old_2 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - m128 res_old_1 = ones128(); - m128 res_old_2 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - m128 res_old_1 = ones128(); - m128 res_old_2 = ones128(); - m128 res_old_3 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy); - - m128 res_old_1 = ones128(); - m128 res_old_2 = ones128(); - m128 res_old_3 = ones128(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr)); - CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load128(ptr + 16)); - CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m128 p_mask; - m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and128(r_0, p_mask); - CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 89117b0b..11ea0f8e 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -40,10 +40,79 @@ #if defined(HAVE_AVX2) +const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} +}; + #ifdef ARCH_64_BIT #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero256(var))) { \ + if (unlikely(diff256(var, ones256()))) { \ m256 swap = swap128in256(var); \ m256 r = interleave256lo(var, swap); \ u64a part1 = extractlow64from256(r); \ @@ -51,32 +120,36 @@ do { \ r = interleave256hi(var, swap); \ u64a part3 = extractlow64from256(r); \ u64a part4 = extract64from256(r, 1); \ - if (unlikely(part1)) { \ + if (unlikely(part1 != ones_u64a)) { \ + part1 = ~part1; \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part2)) { \ + if (unlikely(part2 != ones_u64a)) { \ + part2 = ~part2; \ conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part3)) { \ + if (unlikely(part3 != ones_u64a)) { \ + part3 = ~part3; \ conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part4)) { \ + if (unlikely(part4 != ones_u64a)) { \ + part4 = ~part4; \ conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ -} while (0); +} while(0) #else #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ - if (unlikely(isnonzero256(var))) { \ + if (unlikely(diff256(var, ones256()))) { \ m256 swap = swap128in256(var); \ m256 r = interleave256lo(var, swap); \ u32 part1 = extractlow32from256(r); \ @@ -88,56 +161,65 @@ do { \ u32 part6 = extract32from256(r, 1); \ u32 part7 = extract32from256(r, 2); \ u32 part8 = extract32from256(r, 3); \ - if (unlikely(part1)) { \ + if (unlikely(part1 != ones_u32)) { \ + part1 = ~part1; \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part2)) { \ + if (unlikely(part2 != ones_u32)) { \ + part2 = ~part2; \ conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \ &control, &last_match); \ } \ - if (unlikely(part3)) { \ + if (unlikely(part3 != ones_u32)) { \ + part3 = ~part3; \ conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part4)) { \ + if (unlikely(part4 != ones_u32)) { \ + part4 = ~part4; \ conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part5)) { \ + if (unlikely(part5 != ones_u32)) { \ + part5 = ~part5; \ conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part6)) { \ + if (unlikely(part6 != ones_u32)) { \ + part6 = ~part6; \ conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part7)) { \ + if (unlikely(part7 != ones_u32)) { \ + part7 = ~part7; \ conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ - if (unlikely(part8)) { \ + if (unlikely(part8 != ones_u32)) { \ + part8 = ~part8; \ conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \ &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ -} while (0); +} while(0) #endif static really_inline -m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, +m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *buf_history, size_t len_history, const u32 nMasks) { m128 p_mask128; - m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, - len_history, nMasks)); + m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, + buf_history, len_history, nMasks)); *p_mask = set2x128(p_mask128); return ret; } @@ -147,8 +229,8 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - return and256(pshufb_m256(maskBase[0*2], lo), - pshufb_m256(maskBase[0*2+1], hi)); + return or256(pshufb_m256(maskBase[0 * 2], lo), + pshufb_m256(maskBase[0 * 2 + 1], hi)); } static really_inline @@ -158,11 +240,11 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m1(maskBase, val); - m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo), - pshufb_m256(maskBase[1*2+1], hi)); - m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1); + m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo), + pshufb_m256(maskBase[1 * 2 + 1], hi)); + m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1); *old_1 = res_1; - return and256(r, res_shifted_1); + return or256(r, res_shifted_1); } static really_inline @@ -173,11 +255,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); - m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo), - pshufb_m256(maskBase[2*2+1], hi)); - m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2); + m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo), + pshufb_m256(maskBase[2 * 2 + 1], hi)); + m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2); *old_2 = res_2; - return and256(r, res_shifted_2); + return or256(r, res_shifted_2); } static really_inline @@ -188,11 +270,11 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); - m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo), - pshufb_m256(maskBase[3*2+1], hi)); - m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3); + m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo), + pshufb_m256(maskBase[3 * 2 + 1], hi)); + m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3); *old_3 = res_3; - return and256(r, res_shifted_3); + return or256(r, res_shifted_3); } static really_inline @@ -200,486 +282,151 @@ const m256 *getMaskBase_avx2(const struct Teddy *teddy) { return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); } +#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \ +do { \ +} while(0) + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \ + m256 res_old_1 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); \ + m256 res_old_3 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n + +#define PREP_CONF_FAT_FN_1(mask_base, val) \ + prep_conf_fat_teddy_m1(mask_base, val) + +#define PREP_CONF_FAT_FN_2(mask_base, val) \ + prep_conf_fat_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FAT_FN_3(mask_base, val) \ + prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FAT_FN_4(mask_base, val) \ + prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FAT_FN(mask_base, val, n) \ + PREP_CONF_FAT_FN_##n(mask_base, val) + +#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = (u32)-1; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m256 *maskBase = getMaskBase_avx2(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \ + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - m256 res_old_1 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - m256 res_old_1 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 3); - m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - m256 res_old_3 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 32; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m256 *maskBase = getMaskBase_avx2(teddy); - const u32 *confBase = getConfBase(teddy); - - m256 res_old_1 = ones256(); - m256 res_old_2 = ones256(); - m256 res_old_3 = ones256(); - const u8 *mainStart = ROUNDUP_PTR(ptr, 16); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 16; - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr)); - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, load2x128(ptr + 16)); - CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); - } - - for (; ptr < buf_end; ptr += 16) { - m256 p_mask; - m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, - a->buf_history, a->len_history, 4); - m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, val_0); - r_0 = and256(r_0, p_mask); - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - } - - return HWLM_SUCCESS; + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); } #endif // HAVE_AVX2 diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 663d0483..14f19354 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -309,74 +309,65 @@ bool TeddyCompiler::pack(map TeddyCompiler::build() { - assert(eng.numMasks <= MAX_NUM_MASKS); +// this entry has all-zero mask to skip reinforcement +#define NO_REINFORCEMENT N_CHARS - if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) { - DEBUG_PRINTF("too many literals: %zu\n", lits.size()); - return nullptr; +// this means every entry in reinforcement table +#define ALL_CHAR_SET N_CHARS + +// each item's reinforcement mask has REINFORCED_MSK_LEN bytes +#define REINFORCED_MSK_LEN 8 + +static +void initReinforcedTable(u8 *reinforcedMsk) { + u64a *mask = (u64a *)reinforcedMsk; + fill_n(mask, N_CHARS, 0x00ffffffffffffffULL); +} + +static +void fillReinforcedMskZero(u8 *reinforcedMsk) { + u8 *mc = reinforcedMsk + NO_REINFORCEMENT * REINFORCED_MSK_LEN; + fill_n(mc, REINFORCED_MSK_LEN, 0x00); +} + +static +void fillReinforcedMsk(u8 *reinforcedMsk, u16 c, u32 j, u8 bmsk) { + assert(j > 0); + if (c == ALL_CHAR_SET) { + for (size_t i = 0; i < N_CHARS; i++) { + u8 *mc = reinforcedMsk + i * REINFORCED_MSK_LEN; + mc[j - 1] &= ~bmsk; + } + } else { + u8 *mc = reinforcedMsk + c * REINFORCED_MSK_LEN; + mc[j - 1] &= ~bmsk; } +} #ifdef TEDDY_DEBUG - for (size_t i = 0; i < lits.size(); i++) { - printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(), - lits[i].nocase ? "caseless" : "caseful"); - for (size_t j = 0; j < lits[i].s.size(); j++) { - printf("%02x", ((u32)lits[i].s[j]) & 0xff); +static +void dumpReinforcedMaskTable(const u8 *msks) { + for (u32 i = 0; i <= N_CHARS; i++) { + printf("0x%02x: ", i); + for (u32 j = 0; j < REINFORCED_MSK_LEN; j++) { + u8 val = msks[i * REINFORCED_MSK_LEN + j]; + for (u32 k = 0; k < 8; k++) { + printf("%s", ((val >> k) & 0x1) ? "1" : "0"); + } + printf(" "); } printf("\n"); } +} #endif - map> bucketToLits; - if (eng.needConfirm(lits)) { - if (!pack(bucketToLits)) { - DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n", - lits.size(), eng.getNumBuckets()); - return nullptr; - } - } else { - for (u32 i = 0; i < lits.size(); i++) { - bucketToLits[i].push_back(i); - } - } - u32 maskWidth = eng.getNumBuckets() / 8; - - size_t headerSize = sizeof(Teddy); - size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; - - auto floodTable = setupFDRFloodControl(lits, eng, grey); - auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); - - // Note: we place each major structure here on a cacheline boundary. - size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + - ROUNDUP_CL(confirmTable.size()) + floodTable.size(); - - auto fdr = make_zeroed_bytecode_ptr(size, 64); - assert(fdr); // otherwise would have thrown std::bad_alloc - Teddy *teddy = (Teddy *)fdr.get(); // ugly - u8 *teddy_base = (u8 *)teddy; - - // Write header. - teddy->size = size; - teddy->engineID = eng.getID(); - teddy->maxStringLen = verify_u32(maxLen(lits)); - - // Write confirm structures. - u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen); - assert(ISALIGNED_CL(ptr)); - teddy->confOffset = verify_u32(ptr - teddy_base); - memcpy(ptr, confirmTable.get(), confirmTable.size()); - ptr += ROUNDUP_CL(confirmTable.size()); - - // Write flood control structures. - assert(ISALIGNED_CL(ptr)); - teddy->floodOffset = verify_u32(ptr - teddy_base); - memcpy(ptr, floodTable.get(), floodTable.size()); - ptr += floodTable.size(); - - // Write teddy masks. - u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize); +static +void fillNibbleMasks(const map> &bucketToLits, + const vector &lits, + u32 numMasks, u32 maskWidth, size_t maskLen, + u8 *baseMsk) { + memset(baseMsk, 0xff, maskLen); for (const auto &b2l : bucketToLits) { const u32 &bucket_id = b2l.first; @@ -389,7 +380,7 @@ bytecode_ptr TeddyCompiler::build() { const u32 sz = verify_u32(l.s.size()); // fill in masks - for (u32 j = 0; j < eng.numMasks; j++) { + for (u32 j = 0; j < numMasks; j++) { const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8); const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8); const u32 lo_base = msk_id_lo * 16; @@ -399,8 +390,8 @@ bytecode_ptr TeddyCompiler::build() { // locations in these masks with '1' if (j >= sz) { for (u32 n = 0; n < 16; n++) { - baseMsk[lo_base + n] |= bmsk; - baseMsk[hi_base + n] |= bmsk; + baseMsk[lo_base + n] &= ~bmsk; + baseMsk[hi_base + n] &= ~bmsk; } } else { u8 c = l.s[sz - 1 - j]; @@ -419,27 +410,139 @@ bytecode_ptr TeddyCompiler::build() { for (u8 cm = 0; cm < 0x10; cm++) { if ((cm & m_lo) == (cmp_lo & m_lo)) { - baseMsk[lo_base + cm] |= bmsk; + baseMsk[lo_base + cm] &= ~bmsk; } if ((cm & m_hi) == (cmp_hi & m_hi)) { - baseMsk[hi_base + cm] |= bmsk; + baseMsk[hi_base + cm] &= ~bmsk; } } } else { if (l.nocase && ourisalpha(c)) { u32 cmHalfClear = (0xdf >> hiShift) & 0xf; u32 cmHalfSet = (0x20 >> hiShift) & 0xf; - baseMsk[hi_base + (n_hi & cmHalfClear)] |= bmsk; - baseMsk[hi_base + (n_hi | cmHalfSet)] |= bmsk; + baseMsk[hi_base + (n_hi & cmHalfClear)] &= ~bmsk; + baseMsk[hi_base + (n_hi | cmHalfSet)] &= ~bmsk; } else { - baseMsk[hi_base + n_hi] |= bmsk; + baseMsk[hi_base + n_hi] &= ~bmsk; } - baseMsk[lo_base + n_lo] |= bmsk; + baseMsk[lo_base + n_lo] &= ~bmsk; } } } } } +} + +static +void fillReinforcedTable(const map> &bucketToLits, + const vector &lits, + u8 *reinforcedMsk) { + initReinforcedTable(reinforcedMsk); + + for (const auto &b2l : bucketToLits) { + const u32 &bucket_id = b2l.first; + const vector &ids = b2l.second; + const u8 bmsk = 1U << (bucket_id % 8); + + for (const LiteralIndex &lit_id : ids) { + const hwlmLiteral &l = lits[lit_id]; + DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id); + const u32 sz = verify_u32(l.s.size()); + + // fill in reinforced masks + for (u32 j = 1; j < REINFORCED_MSK_LEN; j++) { + if (sz - 1 < j) { + fillReinforcedMsk(reinforcedMsk, ALL_CHAR_SET, j, bmsk); + } else { + u8 c = l.s[sz - 1 - j]; + if (l.nocase && ourisalpha(c)) { + u8 c_up = c & 0xdf; + fillReinforcedMsk(reinforcedMsk, c_up, j, bmsk); + u8 c_lo = c | 0x20; + fillReinforcedMsk(reinforcedMsk, c_lo, j, bmsk); + } else { + fillReinforcedMsk(reinforcedMsk, c, j, bmsk); + } + } + } + } + } + + fillReinforcedMskZero(reinforcedMsk); +} + +bytecode_ptr TeddyCompiler::build() { + assert(eng.numMasks <= MAX_NUM_MASKS); + + if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) { + DEBUG_PRINTF("too many literals: %zu\n", lits.size()); + return nullptr; + } + +#ifdef TEDDY_DEBUG + for (size_t i = 0; i < lits.size(); i++) { + printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(), + lits[i].nocase ? "caseless" : "caseful"); + for (size_t j = 0; j < lits[i].s.size(); j++) { + printf("%02x", ((u32)lits[i].s[j])&0xff); + } + printf("\n"); + } +#endif + + map> bucketToLits; + if (!pack(bucketToLits)) { + DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n", + lits.size(), eng.getNumBuckets()); + return nullptr; + } + u32 maskWidth = eng.getNumBuckets() / 8; + + size_t headerSize = sizeof(Teddy); + size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; + size_t reinforcedMaskLen = (N_CHARS + 1) * REINFORCED_MSK_LEN; + + auto floodTable = setupFDRFloodControl(lits, eng, grey); + auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); + + // Note: we place each major structure here on a cacheline boundary. + size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + + ROUNDUP_CL(reinforcedMaskLen) + + ROUNDUP_CL(confirmTable.size()) + floodTable.size(); + + auto fdr = make_zeroed_bytecode_ptr(size, 64); + assert(fdr); // otherwise would have thrown std::bad_alloc + Teddy *teddy = (Teddy *)fdr.get(); // ugly + u8 *teddy_base = (u8 *)teddy; + + // Write header. + teddy->size = size; + teddy->engineID = eng.getID(); + teddy->maxStringLen = verify_u32(maxLen(lits)); + + // Write confirm structures. + u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + + ROUNDUP_CL(reinforcedMaskLen); + assert(ISALIGNED_CL(ptr)); + teddy->confOffset = verify_u32(ptr - teddy_base); + memcpy(ptr, confirmTable.get(), confirmTable.size()); + ptr += ROUNDUP_CL(confirmTable.size()); + + // Write flood control structures. + assert(ISALIGNED_CL(ptr)); + teddy->floodOffset = verify_u32(ptr - teddy_base); + memcpy(ptr, floodTable.get(), floodTable.size()); + ptr += floodTable.size(); + + // Write teddy masks. + u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize); + fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen, + baseMsk); + + // Write reinforcement masks. + u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen); + fillReinforcedTable(bucketToLits, lits, reinforcedMsk); #ifdef TEDDY_DEBUG for (u32 i = 0; i < eng.numMasks * 2; i++) { @@ -452,6 +555,10 @@ bytecode_ptr TeddyCompiler::build() { } printf("\n"); } + + printf("\n===============================================\n" + "reinforced mask table for low boundary (original)\n\n"); + dumpReinforcedMaskTable(reinforcedMsk); #endif return fdr; diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp index f7559b13..88ae0f53 100644 --- a/src/fdr/teddy_engine_description.cpp +++ b/src/fdr/teddy_engine_description.cpp @@ -51,18 +51,6 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const { return numMasks; } -bool TeddyEngineDescription::needConfirm(const vector &lits) const { - if (packed || lits.size() > getNumBuckets()) { - return true; - } - for (const auto &lit : lits) { - if (lit.s.size() > numMasks || !lit.msk.empty()) { - return true; - } - } - return false; -} - void getTeddyDescriptions(vector *out) { static const TeddyEngineDef defns[] = { { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false }, diff --git a/src/fdr/teddy_engine_description.h b/src/fdr/teddy_engine_description.h index 3979a5d3..95931613 100644 --- a/src/fdr/teddy_engine_description.h +++ b/src/fdr/teddy_engine_description.h @@ -55,7 +55,6 @@ public: explicit TeddyEngineDescription(const TeddyEngineDef &def); u32 getDefaultFloodSuffixLength() const override; - bool needConfirm(const std::vector &lits) const; }; std::unique_ptr diff --git a/src/fdr/teddy_internal.h b/src/fdr/teddy_internal.h index 359d1e13..d1752452 100644 --- a/src/fdr/teddy_internal.h +++ b/src/fdr/teddy_internal.h @@ -26,6 +26,25 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/* Teddy bytecode layout: + * * |-----| + * * | | struct Teddy + * * |-----| + * * | | teddy masks + * * | | + * * |-----| + * * | | reinforcement mask table + * * | | + * * |-----| + * * | | confirm + * * | | + * * | | + * * |-----| + * * | | flood control + * * | | + * * |-----| + */ + #ifndef TEDDY_INTERNAL_H #define TEDDY_INTERNAL_H diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index f63df724..c1333964 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -38,8 +38,12 @@ #include "ue2common.h" #include "util/bitutils.h" #include "util/simd_utils.h" +#include "util/uniform_ops.h" extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; +#if defined(__AVX2__) +extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64]; +#endif #ifdef ARCH_64_BIT #define TEDDY_CONF_TYPE u64a @@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) { } // Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load128(p_mask_arr[n] + 16 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// start 0 start+offset end(<=16) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=16) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... static really_inline -m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, +m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *buf_history, size_t len_history, const u32 nMasks) { union { @@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, uintptr_t copy_start; uintptr_t copy_len; - if (ptr >= lo) { + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); uintptr_t avail = (uintptr_t)(hi - ptr); if (avail >= 16) { - *p_mask = load128(p_mask_arr[16] + 16); + assert(start_offset - start <= 16); + *p_mask = loadu128(p_mask_arr[16 - start_offset + start] + + 16 - start_offset + start); return loadu128(ptr); } - *p_mask = load128(p_mask_arr[avail] + 16); + assert(start_offset - start <= avail); + *p_mask = loadu128(p_mask_arr[avail - start_offset + start] + + 16 - start_offset + start); copy_start = 0; copy_len = avail; - } else { + } else { // start zone uintptr_t need = MIN((uintptr_t)(lo - ptr), MIN(len_history, nMasks - 1)); uintptr_t start = (uintptr_t)(lo - ptr); uintptr_t i; - for (i = start - need; ptr + i < lo; i++) { - u.val8[i] = buf_history[len_history - (lo - (ptr + i))]; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; } uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); - *p_mask = loadu128(p_mask_arr[end - start] + 16 - start); - copy_start = i; - copy_len = end - i; + assert(start + start_offset <= end); + *p_mask = loadu128(p_mask_arr[end - start - start_offset] + + 16 - start - start_offset); + copy_start = start; + copy_len = end - start; } // Runt block from the buffer. @@ -152,6 +182,135 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, return u.val128; } +#if defined(__AVX2__) +/* + * \brief Copy a block of [0,31] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad256. + */ +static really_inline +void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* Perform copy with two overlapping 8-byte chunks. */ + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + default: + /* Perform copy with two overlapping 16-byte chunks. */ + assert(len < 32); + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load256(p_mask_arr256[n] + 32 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// start 0 start+offset end(<=32) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=32) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[32]; + m256 val256; + } u; + u.val256 = zeroes256(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 32) { + assert(start_offset - start <= 32); + *p_mask = loadu256(p_mask_arr256[32 - start_offset + start] + + 32 - start_offset + start); + return loadu256(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu256(p_mask_arr256[avail - start_offset + start] + + 32 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu256(p_mask_arr256[end - start - start_offset] + + 32 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val256; +} +#endif // __AVX2__ + static really_inline u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, CautionReason reason) { @@ -196,53 +355,17 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, } while (unlikely(*conf)); } -static really_inline -void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, - const u32 *confBase, CautionReason reason, - const struct FDR_Runtime_Args *a, const u8 *ptr, - hwlmcb_rv_t *control, u32 *last_match) { - do { - u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); - u32 byte = bit / bucket + offset; - u32 idx = bit % bucket; - u32 cf = confBase[idx]; - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - continue; - } - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match, - confVal); - } while (unlikely(*conf)); -} - -static really_inline -void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, - const u32 *confBase, CautionReason reason, - const struct FDR_Runtime_Args *a, const u8 *ptr, - hwlmcb_rv_t *control, u32 *last_match) { - do { - u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); - u32 byte = bit / bucket + offset; - u32 idx = bit % bucket; - u32 cf = confBase[idx]; - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - continue; - } - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control, - last_match, confVal); - } while (unlikely(*conf)); -} - static really_inline const m128 *getMaskBase(const struct Teddy *teddy) { return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); } +static really_inline +const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const u64a *)((const u8 *)getMaskBase(teddy) + + ROUNDUP_CL(2 * numMask * sizeof(m128))); +} + static really_inline const u32 *getConfBase(const struct Teddy *teddy) { return (const u32 *)((const u8 *)teddy + teddy->confOffset);