teddy: apply poison mask after prep_conf_ work

This simplifies the code and removes every call that passed an all-ones
p_mask, which we were otherwise trusting the optimizer to eliminate.
This commit is contained in:
Justin Viiret 2016-07-18 11:32:18 +10:00 committed by Matthew Barr
parent d574557200
commit 3d9a60d023
2 changed files with 148 additions and 147 deletions

View File

@ -125,36 +125,34 @@ do { \
#endif #endif
static really_inline static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) { m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
m128 mask = set16x8(0xf); m128 mask = set16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
return and128(and128(pshufb(maskBase[0*2], lo), return and128(pshufb(maskBase[0*2], lo), pshufb(maskBase[0*2+1], hi));
pshufb(maskBase[0*2+1], hi)), p_mask);
} }
static really_inline static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask, m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
m128 val) {
m128 mask = set16x8(0xf); m128 mask = set16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, p_mask, val); m128 r = prep_conf_teddy_m1(maskBase, val);
m128 res_1 = and128(pshufb(maskBase[1*2], lo), m128 res_1 = and128(pshufb(maskBase[1*2], lo),
pshufb(maskBase[1*2+1], hi)); pshufb(maskBase[1*2+1], hi));
m128 res_shifted_1 = palignr(res_1, *old_1, 16-1); m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
*old_1 = res_1; *old_1 = res_1;
return and128(and128(r, p_mask), res_shifted_1); return and128(r, res_shifted_1);
} }
static really_inline static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 p_mask, m128 val) { m128 val) {
m128 mask = set16x8(0xf); m128 mask = set16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val); m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
m128 res_2 = and128(pshufb(maskBase[2*2], lo), m128 res_2 = and128(pshufb(maskBase[2*2], lo),
pshufb(maskBase[2*2+1], hi)); pshufb(maskBase[2*2+1], hi));
@ -165,11 +163,11 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
static really_inline static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 *old_3, m128 p_mask, m128 val) { m128 *old_3, m128 val) {
m128 mask = set16x8(0xf); m128 mask = set16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
m128 res_3 = and128(pshufb(maskBase[3*2], lo), m128 res_3 = and128(pshufb(maskBase[3*2], lo),
pshufb(maskBase[3*2+1], hi)); pshufb(maskBase[3*2+1], hi));
@ -201,13 +199,14 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16; ptr += 16;
} }
@ -215,9 +214,9 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
} }
@ -225,7 +224,8 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
} }
@ -255,13 +255,14 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
@ -269,9 +270,9 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -279,7 +280,8 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
} }
@ -310,14 +312,14 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
@ -325,11 +327,9 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16));
load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
} }
@ -337,7 +337,8 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
} }
@ -368,14 +369,14 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
@ -383,11 +384,9 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16));
load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -395,7 +394,8 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
} }
@ -428,14 +428,15 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0); val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr)); load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
@ -444,10 +445,10 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr)); load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr + 16)); load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
} }
@ -455,8 +456,8 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0);
p_mask, val_0); r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
} }
@ -489,14 +490,15 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0); val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr)); load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
@ -505,10 +507,10 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr)); load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones128(), load128(ptr + 16)); load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -516,8 +518,8 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
m128 p_mask; m128 p_mask;
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0);
p_mask, val_0); r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
} }
@ -551,14 +553,15 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr)); &res_old_3, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
@ -567,10 +570,10 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr)); &res_old_3, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr + 16)); &res_old_3, load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
} }
@ -579,7 +582,8 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
} }
@ -613,14 +617,15 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr)); &res_old_3, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
@ -629,10 +634,10 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr)); &res_old_3, load128(ptr));
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones128(), load128(ptr + 16)); &res_old_3, load128(ptr + 16));
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -641,7 +646,8 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and128(r_0, p_mask);
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
} }

View File

@ -407,36 +407,35 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) {
} }
static really_inline static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) { m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
m256 mask = set32x8(0xf); m256 mask = set32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
return and256(and256(vpshufb(maskBase[0*2], lo), return and256(vpshufb(maskBase[0*2], lo),
vpshufb(maskBase[0*2+1], hi)), p_mask); vpshufb(maskBase[0*2+1], hi));
} }
static really_inline static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask, m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
m256 val) {
m256 mask = set32x8(0xf); m256 mask = set32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val); m256 r = prep_conf_fat_teddy_m1(maskBase, val);
m256 res_1 = and256(vpshufb(maskBase[1*2], lo), m256 res_1 = and256(vpshufb(maskBase[1*2], lo),
vpshufb(maskBase[1*2+1], hi)); vpshufb(maskBase[1*2+1], hi));
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1); m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1);
*old_1 = res_1; *old_1 = res_1;
return and256(and256(r, p_mask), res_shifted_1); return and256(r, res_shifted_1);
} }
static really_inline static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 p_mask, m256 val) { m256 val) {
m256 mask = set32x8(0xf); m256 mask = set32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
m256 res_2 = and256(vpshufb(maskBase[2*2], lo), m256 res_2 = and256(vpshufb(maskBase[2*2], lo),
vpshufb(maskBase[2*2+1], hi)); vpshufb(maskBase[2*2+1], hi));
@ -447,11 +446,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
static really_inline static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 *old_3, m256 p_mask, m256 val) { m256 *old_3, m256 val) {
m256 mask = set32x8(0xf); m256 mask = set32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
m256 res_3 = and256(vpshufb(maskBase[3*2], lo), m256 res_3 = and256(vpshufb(maskBase[3*2], lo),
vpshufb(maskBase[3*2+1], hi)); vpshufb(maskBase[3*2+1], hi));
@ -461,12 +460,10 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
} }
static really_inline static really_inline
m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi, m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi) {
m256 p_mask) {
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); return and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi));
return and256(res, p_mask);
} }
static really_inline static really_inline
@ -503,13 +500,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
ptr += 16; ptr += 16;
} }
@ -517,10 +515,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
} }
@ -528,7 +525,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
} }
@ -558,13 +556,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
@ -572,10 +571,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -583,7 +581,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 1); a->buf_history, a->len_history, 1);
m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
} }
@ -614,14 +613,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
@ -629,10 +628,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
load2x128(ptr + 16)); load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
} }
@ -641,7 +639,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
} }
@ -672,25 +671,24 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
load2x128(ptr + 16)); load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -699,7 +697,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 2); a->buf_history, a->len_history, 2);
m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
} }
@ -732,14 +731,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0); val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones256(), load2x128(ptr)); load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
@ -748,10 +748,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones256(), load2x128(ptr)); load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones256(), load2x128(ptr + 16)); load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
} }
@ -760,7 +760,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0); val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
} }
@ -793,14 +794,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0); val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones256(), load2x128(ptr)); load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
@ -809,10 +811,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones256(), load2x128(ptr)); load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
ones256(), load2x128(ptr + 16)); load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -821,7 +823,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 3); a->buf_history, a->len_history, 3);
m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
p_mask, val_0); val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
} }
@ -855,15 +858,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones256(), &res_old_3, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
ptr += 16; ptr += 16;
} }
@ -872,12 +875,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones256(), &res_old_3, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones256(), &res_old_3, load2x128(ptr + 16));
load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
} }
@ -886,7 +887,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
} }
@ -920,15 +922,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
if (ptr + 16 < buf_end) { if (ptr + 16 < buf_end) {
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones256(), &res_old_3, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
ptr += 16; ptr += 16;
} }
@ -937,12 +939,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
__builtin_prefetch(ptr + (iterBytes*4)); __builtin_prefetch(ptr + (iterBytes*4));
CHECK_FLOOD; CHECK_FLOOD;
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones256(), &res_old_3, load2x128(ptr));
load2x128(ptr));
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, ones256(), &res_old_3, load2x128(ptr + 16));
load2x128(ptr + 16));
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
} }
@ -951,7 +951,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
a->buf_history, a->len_history, 4); a->buf_history, a->len_history, 4);
m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
&res_old_3, p_mask, val_0); &res_old_3, val_0);
r_0 = and256(r_0, p_mask);
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
} }
@ -986,16 +987,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history); buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
p_mask); res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
ptr += 32; ptr += 32;
} }
if (ptr + 32 < buf_end) { if (ptr + 32 < buf_end) {
m256 val_0 = load256(ptr + 0); m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
ones256());
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
ptr += 32; ptr += 32;
} }
@ -1005,13 +1005,11 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
CHECK_FLOOD; CHECK_FLOOD;
m256 val_0 = load256(ptr + 0); m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
ones256());
CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
m256 val_1 = load256(ptr + 32); m256 val_1 = load256(ptr + 32);
m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi, m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi);
ones256());
CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
} }
@ -1019,8 +1017,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history); buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
p_mask); res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
} }
@ -1055,16 +1053,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history); buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
p_mask); res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
ptr += 32; ptr += 32;
} }
if (ptr + 32 < buf_end) { if (ptr + 32 < buf_end) {
m256 val_0 = load256(ptr + 0); m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
ones256());
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
ptr += 32; ptr += 32;
} }
@ -1074,13 +1071,11 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
CHECK_FLOOD; CHECK_FLOOD;
m256 val_0 = load256(ptr + 0); m256 val_0 = load256(ptr + 0);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
ones256());
CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
m256 val_1 = load256(ptr + 32); m256 val_1 = load256(ptr + 32);
m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi, m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi);
ones256());
CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy); CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
} }
@ -1088,8 +1083,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
m256 p_mask; m256 p_mask;
m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
buf_end, a->buf_history, a->len_history); buf_end, a->buf_history, a->len_history);
m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi);
p_mask); res_0 = and256(res_0, p_mask);
CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
} }