mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
AVX512 reinforced teddy.
This commit is contained in:
parent
340773481e
commit
68e08d8e18
641
src/fdr/teddy.c
641
src/fdr/teddy.c
@ -74,7 +74,294 @@ const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
|
|||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(__AVX2__) // reinforced teddy
|
#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(chunk != ones_u64a)) { \
|
||||||
|
chunk = ~chunk; \
|
||||||
|
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
|
||||||
|
&control, &last_match); \
|
||||||
|
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(chunk != ones_u32a)) { \
|
||||||
|
chunk = ~chunk; \
|
||||||
|
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
|
||||||
|
&control, &last_match); \
|
||||||
|
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512) // AVX512 reinforced teddy
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(diff512(var, ones512()))) { \
|
||||||
|
m128 p128_0 = extract128from512(var, 0); \
|
||||||
|
m128 p128_1 = extract128from512(var, 1); \
|
||||||
|
m128 p128_2 = extract128from512(var, 2); \
|
||||||
|
m128 p128_3 = extract128from512(var, 3); \
|
||||||
|
u64a part1 = movq(p128_0); \
|
||||||
|
u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \
|
||||||
|
u64a part3 = movq(p128_1); \
|
||||||
|
u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \
|
||||||
|
u64a part5 = movq(p128_2); \
|
||||||
|
u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \
|
||||||
|
u64a part7 = movq(p128_3); \
|
||||||
|
u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \
|
||||||
|
CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn); \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
#else
|
||||||
|
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(diff512(var, ones512()))) { \
|
||||||
|
m128 p128_0 = extract128from512(var, 0); \
|
||||||
|
m128 p128_1 = extract128from512(var, 1); \
|
||||||
|
m128 p128_2 = extract128from512(var, 2); \
|
||||||
|
m128 p128_3 = extract128from512(var, 3); \
|
||||||
|
u32 part1 = movd(p128_0); \
|
||||||
|
u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \
|
||||||
|
u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \
|
||||||
|
u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \
|
||||||
|
u32 part5 = movd(p128_1); \
|
||||||
|
u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \
|
||||||
|
u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \
|
||||||
|
u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \
|
||||||
|
u32 part9 = movd(p128_2); \
|
||||||
|
u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \
|
||||||
|
u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \
|
||||||
|
u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \
|
||||||
|
u32 part13 = movd(p128_3); \
|
||||||
|
u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \
|
||||||
|
u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \
|
||||||
|
u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \
|
||||||
|
CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part10, bucket, offset + 36, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn); \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \
|
||||||
|
m512 lo = and512(val, *lo_mask); \
|
||||||
|
m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
|
||||||
|
|
||||||
|
#define PREP_SHUF_MASK \
|
||||||
|
PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr)); \
|
||||||
|
*c_16 = *(ptr + 15); \
|
||||||
|
*c_32 = *(ptr + 31); \
|
||||||
|
*c_48 = *(ptr + 47); \
|
||||||
|
m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\
|
||||||
|
0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\
|
||||||
|
*c_0 = *(ptr + 63)
|
||||||
|
|
||||||
|
#define SHIFT_OR_M1 \
|
||||||
|
or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))
|
||||||
|
|
||||||
|
#define SHIFT_OR_M2 \
|
||||||
|
or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \
|
||||||
|
pshufb_m512(dup_mask[3], hi)), \
|
||||||
|
1), SHIFT_OR_M1)
|
||||||
|
|
||||||
|
#define SHIFT_OR_M3 \
|
||||||
|
or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \
|
||||||
|
pshufb_m512(dup_mask[5], hi)), \
|
||||||
|
2), SHIFT_OR_M2)
|
||||||
|
|
||||||
|
#define SHIFT_OR_M4 \
|
||||||
|
or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \
|
||||||
|
pshufb_m512(dup_mask[7], hi)), \
|
||||||
|
3), SHIFT_OR_M3)
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask,
|
||||||
|
const m512 *dup_mask,
|
||||||
|
const m512 val) {
|
||||||
|
PREP_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||||
|
return SHIFT_OR_M1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask,
|
||||||
|
const m512 *dup_mask,
|
||||||
|
const m512 val) {
|
||||||
|
PREP_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||||
|
return SHIFT_OR_M2;
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask,
|
||||||
|
const m512 *dup_mask,
|
||||||
|
const m512 val) {
|
||||||
|
PREP_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||||
|
return SHIFT_OR_M3;
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask,
|
||||||
|
const m512 *dup_mask,
|
||||||
|
const m512 val) {
|
||||||
|
PREP_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||||
|
return SHIFT_OR_M4;
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
|
||||||
|
const u8 *ptr, const u64a *r_msk_base,
|
||||||
|
u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
|
||||||
|
PREP_SHUF_MASK;
|
||||||
|
return or512(SHIFT_OR_M1, r_msk);
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
|
||||||
|
const u8 *ptr, const u64a *r_msk_base,
|
||||||
|
u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
|
||||||
|
PREP_SHUF_MASK;
|
||||||
|
return or512(SHIFT_OR_M2, r_msk);
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
|
||||||
|
const u8 *ptr, const u64a *r_msk_base,
|
||||||
|
u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
|
||||||
|
PREP_SHUF_MASK;
|
||||||
|
return or512(SHIFT_OR_M3, r_msk);
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
|
||||||
|
const u8 *ptr, const u64a *r_msk_base,
|
||||||
|
u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
|
||||||
|
PREP_SHUF_MASK;
|
||||||
|
return or512(SHIFT_OR_M4, r_msk);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \
|
||||||
|
prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
|
||||||
|
|
||||||
|
#define PREP_CONF_FN(ptr, n) \
|
||||||
|
prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, \
|
||||||
|
&c_0, &c_16, &c_32, &c_48)
|
||||||
|
|
||||||
|
#define PREPARE_MASKS_1 \
|
||||||
|
dup_mask[0] = set4x128(maskBase[0]); \
|
||||||
|
dup_mask[1] = set4x128(maskBase[1]);
|
||||||
|
|
||||||
|
#define PREPARE_MASKS_2 \
|
||||||
|
PREPARE_MASKS_1 \
|
||||||
|
dup_mask[2] = set4x128(maskBase[2]); \
|
||||||
|
dup_mask[3] = set4x128(maskBase[3]);
|
||||||
|
|
||||||
|
#define PREPARE_MASKS_3 \
|
||||||
|
PREPARE_MASKS_2 \
|
||||||
|
dup_mask[4] = set4x128(maskBase[4]); \
|
||||||
|
dup_mask[5] = set4x128(maskBase[5]);
|
||||||
|
|
||||||
|
#define PREPARE_MASKS_4 \
|
||||||
|
PREPARE_MASKS_3 \
|
||||||
|
dup_mask[6] = set4x128(maskBase[6]); \
|
||||||
|
dup_mask[7] = set4x128(maskBase[7]);
|
||||||
|
|
||||||
|
#define PREPARE_MASKS(n) \
|
||||||
|
m512 lo_mask = set64x8(0xf); \
|
||||||
|
m512 dup_mask[n * 2]; \
|
||||||
|
PREPARE_MASKS_##n
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
||||||
|
do { \
|
||||||
|
const u8 *buf_end = a->buf + a->len; \
|
||||||
|
const u8 *ptr = a->buf + a->start_offset; \
|
||||||
|
u32 floodBackoff = FLOOD_BACKOFF_START; \
|
||||||
|
const u8 *tryFloodDetect = a->firstFloodDetect; \
|
||||||
|
u32 last_match = (u32)-1; \
|
||||||
|
const struct Teddy *teddy = (const struct Teddy *)fdr; \
|
||||||
|
const size_t iterBytes = 128; \
|
||||||
|
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
|
||||||
|
a->buf, a->len, a->start_offset); \
|
||||||
|
\
|
||||||
|
const m128 *maskBase = getMaskBase(teddy); \
|
||||||
|
PREPARE_MASKS(n_msk); \
|
||||||
|
const u32 *confBase = getConfBase(teddy); \
|
||||||
|
\
|
||||||
|
const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \
|
||||||
|
u32 c_0 = 0x100; \
|
||||||
|
u32 c_16 = 0x100; \
|
||||||
|
u32 c_32 = 0x100; \
|
||||||
|
u32 c_48 = 0x100; \
|
||||||
|
const u8 *mainStart = ROUNDUP_PTR(ptr, 64); \
|
||||||
|
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
|
||||||
|
if (ptr < mainStart) { \
|
||||||
|
ptr = mainStart - 64; \
|
||||||
|
m512 p_mask; \
|
||||||
|
m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset, \
|
||||||
|
a->buf, buf_end, \
|
||||||
|
a->buf_history, a->len_history, n_msk); \
|
||||||
|
m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
|
||||||
|
r_0 = or512(r_0, p_mask); \
|
||||||
|
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
|
||||||
|
ptr += 64; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if (ptr + 64 <= buf_end) { \
|
||||||
|
m512 r_0 = PREP_CONF_FN(ptr, n_msk); \
|
||||||
|
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
|
||||||
|
ptr += 64; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
|
||||||
|
__builtin_prefetch(ptr + (iterBytes * 4)); \
|
||||||
|
CHECK_FLOOD; \
|
||||||
|
m512 r_0 = PREP_CONF_FN(ptr, n_msk); \
|
||||||
|
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
|
||||||
|
m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk); \
|
||||||
|
CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if (ptr + 64 <= buf_end) { \
|
||||||
|
m512 r_0 = PREP_CONF_FN(ptr, n_msk); \
|
||||||
|
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
|
||||||
|
ptr += 64; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
assert(ptr + 64 > buf_end); \
|
||||||
|
if (ptr < buf_end) { \
|
||||||
|
m512 p_mask; \
|
||||||
|
m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end, \
|
||||||
|
a->buf_history, a->len_history, n_msk); \
|
||||||
|
m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
|
||||||
|
r_0 = or512(r_0, p_mask); \
|
||||||
|
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
return HWLM_SUCCESS; \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
|
||||||
|
|
||||||
#ifdef ARCH_64_BIT
|
#ifdef ARCH_64_BIT
|
||||||
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||||
@ -86,30 +373,10 @@ do { \
|
|||||||
u64a part2 = movq(rshiftbyte_m128(lo, 8)); \
|
u64a part2 = movq(rshiftbyte_m128(lo, 8)); \
|
||||||
u64a part3 = movq(hi); \
|
u64a part3 = movq(hi); \
|
||||||
u64a part4 = movq(rshiftbyte_m128(hi, 8)); \
|
u64a part4 = movq(rshiftbyte_m128(hi, 8)); \
|
||||||
if (unlikely(part1 != ones_u64a)) { \
|
CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
|
||||||
part1 = ~part1; \
|
CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \
|
||||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \
|
||||||
&control, &last_match); \
|
CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part2 != ones_u64a)) { \
|
|
||||||
part2 = ~part2; \
|
|
||||||
conf_fn(&part2, bucket, offset + 8, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part3 != ones_u64a)) { \
|
|
||||||
part3 = ~part3; \
|
|
||||||
conf_fn(&part3, bucket, offset + 16, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part4 != ones_u64a)) { \
|
|
||||||
part4 = ~part4; \
|
|
||||||
conf_fn(&part4, bucket, offset + 24, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
#else
|
#else
|
||||||
@ -126,54 +393,14 @@ do { \
|
|||||||
u32 part6 = movd(rshiftbyte_m128(hi, 4)); \
|
u32 part6 = movd(rshiftbyte_m128(hi, 4)); \
|
||||||
u32 part7 = movd(rshiftbyte_m128(hi, 8)); \
|
u32 part7 = movd(rshiftbyte_m128(hi, 8)); \
|
||||||
u32 part8 = movd(rshiftbyte_m128(hi, 12)); \
|
u32 part8 = movd(rshiftbyte_m128(hi, 12)); \
|
||||||
if (unlikely(part1 != ones_u32)) { \
|
CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
|
||||||
part1 = ~part1; \
|
CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \
|
||||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \
|
||||||
&control, &last_match); \
|
CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \
|
||||||
} \
|
CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \
|
||||||
if (unlikely(part2 != ones_u32)) { \
|
CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \
|
||||||
part2 = ~part2; \
|
CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \
|
||||||
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part3 != ones_u32)) { \
|
|
||||||
part3 = ~part3; \
|
|
||||||
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part4 != ones_u32)) { \
|
|
||||||
part4 = ~part4; \
|
|
||||||
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part5 != ones_u32)) { \
|
|
||||||
part5 = ~part5; \
|
|
||||||
conf_fn(&part5, bucket, offset + 16, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part6 != ones_u32)) { \
|
|
||||||
part6 = ~part6; \
|
|
||||||
conf_fn(&part6, bucket, offset + 20, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part7 != ones_u32)) { \
|
|
||||||
part7 = ~part7; \
|
|
||||||
conf_fn(&part7, bucket, offset + 24, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part8 != ones_u32)) { \
|
|
||||||
part8 = ~part8; \
|
|
||||||
conf_fn(&part8, bucket, offset + 28, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
#endif
|
#endif
|
||||||
@ -270,121 +497,6 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
|
|||||||
return or256(SHIFT_OR_M4, r_msk);
|
return or256(SHIFT_OR_M4, r_msk);
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // not defined __AVX2__
|
|
||||||
|
|
||||||
#ifdef ARCH_64_BIT
|
|
||||||
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
|
||||||
do { \
|
|
||||||
if (unlikely(diff128(var, ones128()))) { \
|
|
||||||
u64a lo = movq(var); \
|
|
||||||
u64a hi = movq(rshiftbyte_m128(var, 8)); \
|
|
||||||
if (unlikely(lo != ones_u64a)) { \
|
|
||||||
lo = ~lo; \
|
|
||||||
conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(hi != ones_u64a)) { \
|
|
||||||
hi = ~hi; \
|
|
||||||
conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
} while(0)
|
|
||||||
#else
|
|
||||||
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
|
||||||
do { \
|
|
||||||
if (unlikely(diff128(var, ones128()))) { \
|
|
||||||
u32 part1 = movd(var); \
|
|
||||||
u32 part2 = movd(rshiftbyte_m128(var, 4)); \
|
|
||||||
u32 part3 = movd(rshiftbyte_m128(var, 8)); \
|
|
||||||
u32 part4 = movd(rshiftbyte_m128(var, 12)); \
|
|
||||||
if (unlikely(part1 != ones_u32)) { \
|
|
||||||
part1 = ~part1; \
|
|
||||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part2 != ones_u32)) { \
|
|
||||||
part2 = ~part2; \
|
|
||||||
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part3 != ones_u32)) { \
|
|
||||||
part3 = ~part3; \
|
|
||||||
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part4 != ones_u32)) { \
|
|
||||||
part4 = ~part4; \
|
|
||||||
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
} while(0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
|
|
||||||
m128 mask = set16x8(0xf);
|
|
||||||
m128 lo = and128(val, mask);
|
|
||||||
m128 hi = and128(rshift64_m128(val, 4), mask);
|
|
||||||
return or128(pshufb_m128(maskBase[0 * 2], lo),
|
|
||||||
pshufb_m128(maskBase[0 * 2 + 1], hi));
|
|
||||||
}
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
|
|
||||||
m128 mask = set16x8(0xf);
|
|
||||||
m128 lo = and128(val, mask);
|
|
||||||
m128 hi = and128(rshift64_m128(val, 4), mask);
|
|
||||||
m128 r = prep_conf_teddy_m1(maskBase, val);
|
|
||||||
|
|
||||||
m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
|
|
||||||
pshufb_m128(maskBase[1 * 2 + 1], hi));
|
|
||||||
m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1);
|
|
||||||
*old_1 = res_1;
|
|
||||||
return or128(r, res_shifted_1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
|
|
||||||
m128 val) {
|
|
||||||
m128 mask = set16x8(0xf);
|
|
||||||
m128 lo = and128(val, mask);
|
|
||||||
m128 hi = and128(rshift64_m128(val, 4), mask);
|
|
||||||
m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
|
|
||||||
|
|
||||||
m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
|
|
||||||
pshufb_m128(maskBase[2 * 2 + 1], hi));
|
|
||||||
m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2);
|
|
||||||
*old_2 = res_2;
|
|
||||||
return or128(r, res_shifted_2);
|
|
||||||
}
|
|
||||||
|
|
||||||
static really_inline
|
|
||||||
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
|
|
||||||
m128 *old_3, m128 val) {
|
|
||||||
m128 mask = set16x8(0xf);
|
|
||||||
m128 lo = and128(val, mask);
|
|
||||||
m128 hi = and128(rshift64_m128(val, 4), mask);
|
|
||||||
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
|
|
||||||
|
|
||||||
m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
|
|
||||||
pshufb_m128(maskBase[3 * 2 + 1], hi));
|
|
||||||
m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3);
|
|
||||||
*old_3 = res_3;
|
|
||||||
return or128(r, res_shifted_3);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // __AVX2__
|
|
||||||
|
|
||||||
#if defined(__AVX2__) // reinforced teddy
|
|
||||||
|
|
||||||
#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \
|
#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \
|
||||||
prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
|
prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
|
||||||
|
|
||||||
@ -415,42 +527,6 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
|
|||||||
m256 dup_mask[n * 2]; \
|
m256 dup_mask[n * 2]; \
|
||||||
PREPARE_MASKS_##n
|
PREPARE_MASKS_##n
|
||||||
|
|
||||||
#else // not defined __AVX2__
|
|
||||||
|
|
||||||
#define FDR_EXEC_TEDDY_RES_OLD_1
|
|
||||||
|
|
||||||
#define FDR_EXEC_TEDDY_RES_OLD_2 \
|
|
||||||
m128 res_old_1 = zeroes128();
|
|
||||||
|
|
||||||
#define FDR_EXEC_TEDDY_RES_OLD_3 \
|
|
||||||
m128 res_old_1 = zeroes128(); \
|
|
||||||
m128 res_old_2 = zeroes128();
|
|
||||||
|
|
||||||
#define FDR_EXEC_TEDDY_RES_OLD_4 \
|
|
||||||
m128 res_old_1 = zeroes128(); \
|
|
||||||
m128 res_old_2 = zeroes128(); \
|
|
||||||
m128 res_old_3 = zeroes128();
|
|
||||||
|
|
||||||
#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n
|
|
||||||
|
|
||||||
#define PREP_CONF_FN_1(mask_base, val) \
|
|
||||||
prep_conf_teddy_m1(mask_base, val)
|
|
||||||
|
|
||||||
#define PREP_CONF_FN_2(mask_base, val) \
|
|
||||||
prep_conf_teddy_m2(mask_base, &res_old_1, val)
|
|
||||||
|
|
||||||
#define PREP_CONF_FN_3(mask_base, val) \
|
|
||||||
prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
|
|
||||||
|
|
||||||
#define PREP_CONF_FN_4(mask_base, val) \
|
|
||||||
prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
|
|
||||||
|
|
||||||
#define PREP_CONF_FN(mask_base, val, n) \
|
|
||||||
PREP_CONF_FN_##n(mask_base, val)
|
|
||||||
#endif // __AVX2__
|
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX2__) // reinforced teddy
|
|
||||||
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
||||||
do { \
|
do { \
|
||||||
const u8 *buf_end = a->buf + a->len; \
|
const u8 *buf_end = a->buf + a->len; \
|
||||||
@ -517,7 +593,119 @@ do { \
|
|||||||
\
|
\
|
||||||
return HWLM_SUCCESS; \
|
return HWLM_SUCCESS; \
|
||||||
} while(0)
|
} while(0)
|
||||||
#else // not defined __AVX2__
|
|
||||||
|
#else // not defined HAVE_AVX2
|
||||||
|
|
||||||
|
#ifdef ARCH_64_BIT
|
||||||
|
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(diff128(var, ones128()))) { \
|
||||||
|
u64a lo = movq(var); \
|
||||||
|
u64a hi = movq(rshiftbyte_m128(var, 8)); \
|
||||||
|
CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
#else
|
||||||
|
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(diff128(var, ones128()))) { \
|
||||||
|
u32 part1 = movd(var); \
|
||||||
|
u32 part2 = movd(rshiftbyte_m128(var, 4)); \
|
||||||
|
u32 part3 = movd(rshiftbyte_m128(var, 8)); \
|
||||||
|
u32 part4 = movd(rshiftbyte_m128(var, 12)); \
|
||||||
|
CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \
|
||||||
|
CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
|
||||||
|
m128 mask = set16x8(0xf);
|
||||||
|
m128 lo = and128(val, mask);
|
||||||
|
m128 hi = and128(rshift64_m128(val, 4), mask);
|
||||||
|
return or128(pshufb_m128(maskBase[0 * 2], lo),
|
||||||
|
pshufb_m128(maskBase[0 * 2 + 1], hi));
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
|
||||||
|
m128 mask = set16x8(0xf);
|
||||||
|
m128 lo = and128(val, mask);
|
||||||
|
m128 hi = and128(rshift64_m128(val, 4), mask);
|
||||||
|
m128 r = prep_conf_teddy_m1(maskBase, val);
|
||||||
|
|
||||||
|
m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
|
||||||
|
pshufb_m128(maskBase[1 * 2 + 1], hi));
|
||||||
|
m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1);
|
||||||
|
*old_1 = res_1;
|
||||||
|
return or128(r, res_shifted_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
|
||||||
|
m128 val) {
|
||||||
|
m128 mask = set16x8(0xf);
|
||||||
|
m128 lo = and128(val, mask);
|
||||||
|
m128 hi = and128(rshift64_m128(val, 4), mask);
|
||||||
|
m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
|
||||||
|
|
||||||
|
m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
|
||||||
|
pshufb_m128(maskBase[2 * 2 + 1], hi));
|
||||||
|
m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2);
|
||||||
|
*old_2 = res_2;
|
||||||
|
return or128(r, res_shifted_2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
|
||||||
|
m128 *old_3, m128 val) {
|
||||||
|
m128 mask = set16x8(0xf);
|
||||||
|
m128 lo = and128(val, mask);
|
||||||
|
m128 hi = and128(rshift64_m128(val, 4), mask);
|
||||||
|
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
|
||||||
|
|
||||||
|
m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
|
||||||
|
pshufb_m128(maskBase[3 * 2 + 1], hi));
|
||||||
|
m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3);
|
||||||
|
*old_3 = res_3;
|
||||||
|
return or128(r, res_shifted_3);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_RES_OLD_1
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_RES_OLD_2 \
|
||||||
|
m128 res_old_1 = zeroes128();
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_RES_OLD_3 \
|
||||||
|
m128 res_old_1 = zeroes128(); \
|
||||||
|
m128 res_old_2 = zeroes128();
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_RES_OLD_4 \
|
||||||
|
m128 res_old_1 = zeroes128(); \
|
||||||
|
m128 res_old_2 = zeroes128(); \
|
||||||
|
m128 res_old_3 = zeroes128();
|
||||||
|
|
||||||
|
#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n
|
||||||
|
|
||||||
|
#define PREP_CONF_FN_1(mask_base, val) \
|
||||||
|
prep_conf_teddy_m1(mask_base, val)
|
||||||
|
|
||||||
|
#define PREP_CONF_FN_2(mask_base, val) \
|
||||||
|
prep_conf_teddy_m2(mask_base, &res_old_1, val)
|
||||||
|
|
||||||
|
#define PREP_CONF_FN_3(mask_base, val) \
|
||||||
|
prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
|
||||||
|
|
||||||
|
#define PREP_CONF_FN_4(mask_base, val) \
|
||||||
|
prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
|
||||||
|
|
||||||
|
#define PREP_CONF_FN(mask_base, val, n) \
|
||||||
|
PREP_CONF_FN_##n(mask_base, val)
|
||||||
|
|
||||||
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
||||||
do { \
|
do { \
|
||||||
const u8 *buf_end = a->buf + a->len; \
|
const u8 *buf_end = a->buf + a->len; \
|
||||||
@ -581,7 +769,8 @@ do { \
|
|||||||
\
|
\
|
||||||
return HWLM_SUCCESS; \
|
return HWLM_SUCCESS; \
|
||||||
} while(0)
|
} while(0)
|
||||||
#endif // __AVX2__
|
|
||||||
|
#endif // HAVE_AVX2 HAVE_AVX512
|
||||||
|
|
||||||
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
|
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
|
||||||
const struct FDR_Runtime_Args *a,
|
const struct FDR_Runtime_Args *a,
|
||||||
|
@ -109,6 +109,31 @@ const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
|
|||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(chunk != ones_u64a)) { \
|
||||||
|
chunk = ~chunk; \
|
||||||
|
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
|
||||||
|
&control, &last_match); \
|
||||||
|
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \
|
||||||
|
do { \
|
||||||
|
if (unlikely(chunk != ones_u32a)) { \
|
||||||
|
chunk = ~chunk; \
|
||||||
|
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
|
||||||
|
&control, &last_match); \
|
||||||
|
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
const m256 *getMaskBase_avx2(const struct Teddy *teddy) {
|
||||||
|
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef ARCH_64_BIT
|
#ifdef ARCH_64_BIT
|
||||||
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||||
do { \
|
do { \
|
||||||
@ -120,30 +145,10 @@ do { \
|
|||||||
r = interleave256hi(var, swap); \
|
r = interleave256hi(var, swap); \
|
||||||
u64a part3 = extractlow64from256(r); \
|
u64a part3 = extractlow64from256(r); \
|
||||||
u64a part4 = extract64from256(r, 1); \
|
u64a part4 = extract64from256(r, 1); \
|
||||||
if (unlikely(part1 != ones_u64a)) { \
|
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
|
||||||
part1 = ~part1; \
|
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
|
||||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
|
||||||
&control, &last_match); \
|
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part2 != ones_u64a)) { \
|
|
||||||
part2 = ~part2; \
|
|
||||||
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part3 != ones_u64a)) { \
|
|
||||||
part3 = ~part3; \
|
|
||||||
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part4 != ones_u64a)) { \
|
|
||||||
part4 = ~part4; \
|
|
||||||
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
#else
|
#else
|
||||||
@ -161,53 +166,14 @@ do { \
|
|||||||
u32 part6 = extract32from256(r, 1); \
|
u32 part6 = extract32from256(r, 1); \
|
||||||
u32 part7 = extract32from256(r, 2); \
|
u32 part7 = extract32from256(r, 2); \
|
||||||
u32 part8 = extract32from256(r, 3); \
|
u32 part8 = extract32from256(r, 3); \
|
||||||
if (unlikely(part1 != ones_u32)) { \
|
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
|
||||||
part1 = ~part1; \
|
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
|
||||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
|
||||||
&control, &last_match); \
|
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
|
||||||
} \
|
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
|
||||||
if (unlikely(part2 != ones_u32)) { \
|
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
|
||||||
part2 = ~part2; \
|
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
|
||||||
conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
} \
|
|
||||||
if (unlikely(part3 != ones_u32)) { \
|
|
||||||
part3 = ~part3; \
|
|
||||||
conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part4 != ones_u32)) { \
|
|
||||||
part4 = ~part4; \
|
|
||||||
conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part5 != ones_u32)) { \
|
|
||||||
part5 = ~part5; \
|
|
||||||
conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part6 != ones_u32)) { \
|
|
||||||
part6 = ~part6; \
|
|
||||||
conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part7 != ones_u32)) { \
|
|
||||||
part7 = ~part7; \
|
|
||||||
conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
if (unlikely(part8 != ones_u32)) { \
|
|
||||||
part8 = ~part8; \
|
|
||||||
conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \
|
|
||||||
&control, &last_match); \
|
|
||||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
|
||||||
} \
|
|
||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
#endif
|
#endif
|
||||||
@ -277,11 +243,6 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
|
|||||||
return or256(r, res_shifted_3);
|
return or256(r, res_shifted_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
|
||||||
const m256 *getMaskBase_avx2(const struct Teddy *teddy) {
|
|
||||||
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
|
||||||
}
|
|
||||||
|
|
||||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
|
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
|
||||||
do { \
|
do { \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
@ -41,7 +41,7 @@
|
|||||||
#include "util/uniform_ops.h"
|
#include "util/uniform_ops.h"
|
||||||
|
|
||||||
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
|
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
|
||||||
#if defined(__AVX2__)
|
#if defined(HAVE_AVX2)
|
||||||
extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
|
extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -123,7 +123,7 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
|
|||||||
// for start zone, see below
|
// for start zone, see below
|
||||||
// lo ptr hi hi
|
// lo ptr hi hi
|
||||||
// |----------|-------|----------------|............|
|
// |----------|-------|----------------|............|
|
||||||
// start 0 start+offset end(<=16)
|
// -start 0 -start+offset MIN(avail,16)
|
||||||
// p_mask ffff..ff0000...........00ffff..........
|
// p_mask ffff..ff0000...........00ffff..........
|
||||||
// ptr < lo:
|
// ptr < lo:
|
||||||
// only start zone.
|
// only start zone.
|
||||||
@ -182,7 +182,7 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
|||||||
return u.val128;
|
return u.val128;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(HAVE_AVX2)
|
||||||
/*
|
/*
|
||||||
* \brief Copy a block of [0,31] bytes efficiently.
|
* \brief Copy a block of [0,31] bytes efficiently.
|
||||||
*
|
*
|
||||||
@ -251,7 +251,7 @@ void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
|
|||||||
// for start zone, see below
|
// for start zone, see below
|
||||||
// lo ptr hi hi
|
// lo ptr hi hi
|
||||||
// |----------|-------|----------------|............|
|
// |----------|-------|----------------|............|
|
||||||
// start 0 start+offset end(<=32)
|
// -start 0 -start+offset MIN(avail,32)
|
||||||
// p_mask ffff..ff0000...........00ffff..........
|
// p_mask ffff..ff0000...........00ffff..........
|
||||||
// ptr < lo:
|
// ptr < lo:
|
||||||
// only start zone.
|
// only start zone.
|
||||||
@ -309,7 +309,77 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
|||||||
|
|
||||||
return u.val256;
|
return u.val256;
|
||||||
}
|
}
|
||||||
#endif // __AVX2__
|
#endif // HAVE_AVX2
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512)
|
||||||
|
// Note: p_mask is an output param that initialises a poison mask.
|
||||||
|
// u64a k = ones_u64a << n' >> m'; // m' < n'
|
||||||
|
// *p_mask = set_mask_m512(~k);
|
||||||
|
// means p_mask is consist of:
|
||||||
|
// (n' - m') poison bytes "0xff" at the beginning,
|
||||||
|
// followed by (64 - n') valid bytes "0x00",
|
||||||
|
// then followed by the rest m' poison bytes "0xff".
|
||||||
|
// ptr >= lo:
|
||||||
|
// no history.
|
||||||
|
// for end/short zone, ptr==lo and start_offset==0
|
||||||
|
// for start zone, see below
|
||||||
|
// lo ptr hi hi
|
||||||
|
// |----------|-------|----------------|............|
|
||||||
|
// -start 0 -start+offset MIN(avail,64)
|
||||||
|
// p_mask ffff..ff0000...........00ffff..........
|
||||||
|
// ptr < lo:
|
||||||
|
// only start zone.
|
||||||
|
// history
|
||||||
|
// ptr lo hi hi
|
||||||
|
// |----------|-------|----------------|............|
|
||||||
|
// 0 start start+offset end(<=64)
|
||||||
|
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
||||||
|
static really_inline
|
||||||
|
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||||
|
const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
|
||||||
|
const u32 nMasks) {
|
||||||
|
m512 val;
|
||||||
|
|
||||||
|
uintptr_t copy_start;
|
||||||
|
uintptr_t copy_len;
|
||||||
|
|
||||||
|
if (ptr >= lo) { // short/end/start zone
|
||||||
|
uintptr_t start = (uintptr_t)(ptr - lo);
|
||||||
|
uintptr_t avail = (uintptr_t)(hi - ptr);
|
||||||
|
if (avail >= 64) {
|
||||||
|
assert(start_offset - start <= 64);
|
||||||
|
u64a k = ones_u64a << (start_offset - start);
|
||||||
|
*p_mask = set_mask_m512(~k);
|
||||||
|
return loadu512(ptr);
|
||||||
|
}
|
||||||
|
assert(start_offset - start <= avail);
|
||||||
|
u64a k = ones_u64a << (64 - avail + start_offset - start)
|
||||||
|
>> (64 - avail);
|
||||||
|
*p_mask = set_mask_m512(~k);
|
||||||
|
copy_start = 0;
|
||||||
|
copy_len = avail;
|
||||||
|
} else { //start zone
|
||||||
|
uintptr_t need = MIN((uintptr_t)(lo - ptr),
|
||||||
|
MIN(hlen, nMasks - 1));
|
||||||
|
uintptr_t start = (uintptr_t)(lo - ptr);
|
||||||
|
u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
|
||||||
|
val = loadu_maskz_m512(j, &hbuf[hlen - start]);
|
||||||
|
uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
|
||||||
|
assert(start + start_offset <= end);
|
||||||
|
u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end);
|
||||||
|
*p_mask = set_mask_m512(~k);
|
||||||
|
copy_start = start;
|
||||||
|
copy_len = end - start;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(copy_len < 64);
|
||||||
|
assert(copy_len > 0);
|
||||||
|
u64a j = ones_u64a >> (64 - copy_len) << copy_start;
|
||||||
|
val = loadu_mask_m512(val, j, ptr);
|
||||||
|
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
#endif // HAVE_AVX512
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
|
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
|
||||||
|
@ -755,6 +755,10 @@ m256 combine2x128(m128 hi, m128 lo) {
|
|||||||
}
|
}
|
||||||
#endif //AVX2
|
#endif //AVX2
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512)
|
||||||
|
#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
|
||||||
|
#endif
|
||||||
|
|
||||||
/****
|
/****
|
||||||
**** 384-bit Primitives
|
**** 384-bit Primitives
|
||||||
****/
|
****/
|
||||||
@ -969,6 +973,13 @@ m512 set8x64(u64a a) {
|
|||||||
return _mm512_set1_epi64(a);
|
return _mm512_set1_epi64(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
|
||||||
|
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
|
||||||
|
return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
|
||||||
|
lo_3, lo_2, lo_1, lo_0);
|
||||||
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m512 set4x128(m128 a) {
|
m512 set4x128(m128 a) {
|
||||||
return _mm512_broadcast_i32x4(a);
|
return _mm512_broadcast_i32x4(a);
|
||||||
@ -1059,6 +1070,7 @@ m512 lshift64_m512(m512 a, unsigned b) {
|
|||||||
#if defined(HAVE_AVX512)
|
#if defined(HAVE_AVX512)
|
||||||
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
|
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
|
||||||
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
|
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
|
||||||
|
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(_MM_CMPINT_NE)
|
#if !defined(_MM_CMPINT_NE)
|
||||||
@ -1169,6 +1181,11 @@ static really_inline
|
|||||||
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
|
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
|
||||||
return _mm512_mask_loadu_epi8(src, k, ptr);
|
return _mm512_mask_loadu_epi8(src, k, ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m512 set_mask_m512(__mmask64 k) {
|
||||||
|
return _mm512_movm_epi8(k);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// packed unaligned store of first N bytes
|
// packed unaligned store of first N bytes
|
||||||
|
Loading…
x
Reference in New Issue
Block a user