diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c
index 960e2a41..97cff0b4 100644
--- a/src/fdr/teddy.c
+++ b/src/fdr/teddy.c
@@ -311,26 +311,26 @@ const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
     sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);

 #define PREPARE_MASKS_1 \
-    dup_mask[0] = set4x128(maskBase[0]); \
-    dup_mask[1] = set4x128(maskBase[1]);
+    dup_mask[0] = set1_4x128(maskBase[0]); \
+    dup_mask[1] = set1_4x128(maskBase[1]);

 #define PREPARE_MASKS_2 \
     PREPARE_MASKS_1 \
-    dup_mask[2] = set4x128(maskBase[2]); \
-    dup_mask[3] = set4x128(maskBase[3]);
+    dup_mask[2] = set1_4x128(maskBase[2]); \
+    dup_mask[3] = set1_4x128(maskBase[3]);

 #define PREPARE_MASKS_3 \
     PREPARE_MASKS_2 \
-    dup_mask[4] = set4x128(maskBase[4]); \
-    dup_mask[5] = set4x128(maskBase[5]);
+    dup_mask[4] = set1_4x128(maskBase[4]); \
+    dup_mask[5] = set1_4x128(maskBase[5]);

 #define PREPARE_MASKS_4 \
     PREPARE_MASKS_3 \
-    dup_mask[6] = set4x128(maskBase[6]); \
-    dup_mask[7] = set4x128(maskBase[7]);
+    dup_mask[6] = set1_4x128(maskBase[6]); \
+    dup_mask[7] = set1_4x128(maskBase[7]);

 #define PREPARE_MASKS(n) \
-    m512 lo_mask = set64x8(0xf); \
+    m512 lo_mask = set1_64x8(0xf); \
     m512 dup_mask[n * 2]; \
     m512 sl_msk[n - 1]; \
     PREPARE_MASKS_##n \
@@ -570,26 +570,26 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
                          &c_0, &c_16, &c_32, &c_48)

 #define PREPARE_MASKS_1 \
-    dup_mask[0] = set4x128(maskBase[0]); \
-    dup_mask[1] = set4x128(maskBase[1]);
+    dup_mask[0] = set1_4x128(maskBase[0]); \
+    dup_mask[1] = set1_4x128(maskBase[1]);

 #define PREPARE_MASKS_2 \
     PREPARE_MASKS_1 \
-    dup_mask[2] = set4x128(maskBase[2]); \
-    dup_mask[3] = set4x128(maskBase[3]);
+    dup_mask[2] = set1_4x128(maskBase[2]); \
+    dup_mask[3] = set1_4x128(maskBase[3]);

 #define PREPARE_MASKS_3 \
     PREPARE_MASKS_2 \
-    dup_mask[4] = set4x128(maskBase[4]); \
-    dup_mask[5] = set4x128(maskBase[5]);
+    dup_mask[4] = set1_4x128(maskBase[4]); \
+    dup_mask[5] = set1_4x128(maskBase[5]);

 #define PREPARE_MASKS_4 \
     PREPARE_MASKS_3 \
-    dup_mask[6] = set4x128(maskBase[6]); \
-    dup_mask[7] = set4x128(maskBase[7]);
+    dup_mask[6] = set1_4x128(maskBase[6]); \
+    dup_mask[7] = set1_4x128(maskBase[7]);

 #define PREPARE_MASKS(n) \
-    m512 lo_mask = set64x8(0xf); \
+    m512 lo_mask = set1_64x8(0xf); \
     m512 dup_mask[n * 2]; \
     PREPARE_MASKS_##n

@@ -713,7 +713,7 @@ do { \
 #define PREP_SHUF_MASK \
     PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \
     *c_128 = *(ptr + 15); \
-    m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
+    m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
     *c_0 = *(ptr + 31)

 #define SHIFT_OR_M1 \
@@ -805,26 +805,26 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
     prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)

 #define PREPARE_MASKS_1 \
-    dup_mask[0] = set2x128(maskBase[0]); \
-    dup_mask[1] = set2x128(maskBase[1]);
+    dup_mask[0] = set1_2x128(maskBase[0]); \
+    dup_mask[1] = set1_2x128(maskBase[1]);

 #define PREPARE_MASKS_2 \
     PREPARE_MASKS_1 \
-    dup_mask[2] = set2x128(maskBase[2]); \
-    dup_mask[3] = set2x128(maskBase[3]);
+    dup_mask[2] = set1_2x128(maskBase[2]); \
+    dup_mask[3] = set1_2x128(maskBase[3]);

 #define PREPARE_MASKS_3 \
     PREPARE_MASKS_2 \
-    dup_mask[4] = set2x128(maskBase[4]); \
-    dup_mask[5] = set2x128(maskBase[5]);
+    dup_mask[4] = set1_2x128(maskBase[4]); \
+    dup_mask[5] = set1_2x128(maskBase[5]);

 #define PREPARE_MASKS_4 \
     PREPARE_MASKS_3 \
-    dup_mask[6] = set2x128(maskBase[6]); \
-    dup_mask[7] = set2x128(maskBase[7]);
+    dup_mask[6] = set1_2x128(maskBase[6]); \
+    dup_mask[7] = set1_2x128(maskBase[7]);

 #define PREPARE_MASKS(n) \
-    m256 lo_mask = set32x8(0xf); \
+    m256 lo_mask = set1_32x8(0xf); \
     m256 dup_mask[n * 2]; \
     PREPARE_MASKS_##n

@@ -925,7 +925,7 @@ do { \

 static really_inline
 m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
-    m128 mask = set16x8(0xf);
+    m128 mask = set1_16x8(0xf);
     m128 lo = and128(val, mask);
     m128 hi = and128(rshift64_m128(val, 4), mask);
     return or128(pshufb_m128(maskBase[0 * 2], lo),
@@ -934,7 +934,7 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {

 static really_inline
 m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
-    m128 mask = set16x8(0xf);
+    m128 mask = set1_16x8(0xf);
     m128 lo = and128(val, mask);
     m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m1(maskBase, val);
@@ -949,7 +949,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
 static really_inline
 m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
                         m128 val) {
-    m128 mask = set16x8(0xf);
+    m128 mask = set1_16x8(0xf);
     m128 lo = and128(val, mask);
     m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
@@ -964,7 +964,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
 static really_inline
 m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
                         m128 *old_3, m128 val) {
-    m128 mask = set16x8(0xf);
+    m128 mask = set1_16x8(0xf);
     m128 lo = and128(val, mask);
     m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
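The teddy.c hunks are a pure rename: every constructor that broadcasts one value across all lanes gains a set1_ prefix (mirroring Intel's _mm*_set1_* naming), while constructors that take one value per lane keep a plain lanes-by-width name — note set64x4(...) becoming set4x64(...) in PREP_SHUF_MASK, where four distinct 64-bit values are supplied. A minimal sketch of the two wrapper flavours; the real definitions live in Hyperscan's simd_utils headers, which this diff does not show, so treat these bodies as assumptions:

// Hypothetical reconstruction of the wrapper shapes, for illustration only.
// "16x8" reads as 16 lanes of 8 bits; "set1" means broadcast.
#include <emmintrin.h>   // SSE2
#include <immintrin.h>   // AVX2 (compile with -mavx2)
#include <stdint.h>

typedef __m128i m128;
typedef __m256i m256;

static inline m128 set1_16x8(uint8_t c) {   // byte broadcast, 128-bit
    return _mm_set1_epi8((char)c);
}

static inline m256 set1_32x8(uint8_t c) {   // byte broadcast, 256-bit
    return _mm256_set1_epi8((char)c);
}

The shufti.c and truffle.c hunks further down confirm the first mapping: they replace raw _mm_set1_epi8(0xf) calls with set1_16x8(0xf) directly.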
diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c
index 20ea938c..df54fc62 100644
--- a/src/fdr/teddy_avx2.c
+++ b/src/fdr/teddy_avx2.c
@@ -501,15 +501,15 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                        const u8 *buf_history, size_t len_history,
                        const u32 nMasks) {
     m128 p_mask128;
-    m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
+    m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
                                         buf_history, len_history, nMasks));
-    *p_mask = set2x128(p_mask128);
+    *p_mask = set1_2x128(p_mask128);
     return ret;
 }

 static really_inline
 m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
-    m256 mask = set32x8(0xf);
+    m256 mask = set1_32x8(0xf);
     m256 lo = and256(val, mask);
     m256 hi = and256(rshift64_m256(val, 4), mask);
     return or256(pshufb_m256(maskBase[0 * 2], lo),
@@ -518,7 +518,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {

 static really_inline
 m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
-    m256 mask = set32x8(0xf);
+    m256 mask = set1_32x8(0xf);
     m256 lo = and256(val, mask);
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m1(maskBase, val);
@@ -533,7 +533,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
 static really_inline
 m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
                             m256 val) {
-    m256 mask = set32x8(0xf);
+    m256 mask = set1_32x8(0xf);
     m256 lo = and256(val, mask);
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
@@ -548,7 +548,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
 static really_inline
 m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
                             m256 *old_3, m256 val) {
-    m256 mask = set32x8(0xf);
+    m256 mask = set1_32x8(0xf);
     m256 lo = and256(val, mask);
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
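Every prep_conf_* routine touched above (128-, 256- and 512-bit) relies on the same nibble decomposition: mask off the low nibble, shift-and-mask for the high nibble, use each as a 16-entry table index via pshufb, and OR the two lookups. A scalar model of one byte's worth of that work — the function name and placeholder tables are mine; the real tables come from maskBase:

#include <stdint.h>

// Scalar model of prep_conf_*'s per-byte step. The vector code performs
// this for 16/32/64 bytes at once with pshufb.
static uint8_t teddy_byte(uint8_t b, const uint8_t lo_tbl[16],
                          const uint8_t hi_tbl[16]) {
    uint8_t lo = b & 0xf;        // and256(val, mask), mask = set1_*(0xf)
    uint8_t hi = (b >> 4) & 0xf; // and256(rshift64_m256(val, 4), mask)
    return lo_tbl[lo] | hi_tbl[hi];
}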
diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c
index 5edc646a..49fe168f 100644
--- a/src/hwlm/noodle_engine_avx2.c
+++ b/src/hwlm/noodle_engine_avx2.c
@@ -30,11 +30,11 @@ static really_inline
 m256 getMask(u8 c, bool noCase) {
     u8 k = caseClear8(c, noCase);
-    return set32x8(k);
+    return set1_32x8(k);
 }

 static really_inline
 m256 getCaseMask(void) {
-    return set32x8(0xdf);
+    return set1_32x8(0xdf);
 }

 static really_inline
diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c
index 7cd53d7c..5d47768d 100644
--- a/src/hwlm/noodle_engine_sse.c
+++ b/src/hwlm/noodle_engine_sse.c
@@ -30,11 +30,11 @@ static really_inline
 m128 getMask(u8 c, bool noCase) {
     u8 k = caseClear8(c, noCase);
-    return set16x8(k);
+    return set1_16x8(k);
 }

 static really_inline
 m128 getCaseMask(void) {
-    return set16x8(0xdf);
+    return set1_16x8(0xdf);
 }

 static really_inline
diff --git a/src/nfa/mcclellan_common_impl.h b/src/nfa/mcclellan_common_impl.h
index 7b0e7f48..6ec1b1f1 100644
--- a/src/nfa/mcclellan_common_impl.h
+++ b/src/nfa/mcclellan_common_impl.h
@@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,

     if (len) {
         m128 ss_char = load128(sherman_state);
-        m128 cur_char = set16x8(cprime);
+        m128 cur_char = set1_16x8(cprime);

         u32 z = movemask128(eq128(ss_char, cur_char));

diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c
index 4619ff6f..dd00617e 100644
--- a/src/nfa/mcsheng.c
+++ b/src/nfa/mcsheng.c
@@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,

     if (len) {
         m128 ss_char = load128(sherman_state);
-        m128 cur_char = set16x8(cprime);
+        m128 cur_char = set1_16x8(cprime);

         u32 z = movemask128(eq128(ss_char, cur_char));

@@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
     assert(s_in); /* should not already be dead */
     assert(soft_c_end <= hard_c_end);
     DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
-    m128 s = set16x8(s_in - 1);
+    m128 s = set1_16x8(s_in - 1);
     const u8 *c = *c_inout;
     const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
     if (!do_accel) {
@@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,

 #if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
     u32 sheng_limit_x4 = sheng_limit * 0x01010101;
-    m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
-    m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
+    m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4);
+    m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit);
     DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit,
                  m->sheng_accel_limit, sheng_stop_limit);
 #endif
diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h
index 9552fe15..aa416194 100644
--- a/src/nfa/sheng_impl.h
+++ b/src/nfa/sheng_impl.h
@@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
     }

     DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
-    m128 cur_state = set16x8(*state);
+    m128 cur_state = set1_16x8(*state);
     const m128 *masks = s->shuffle_masks;

     while (likely(cur_buf != end)) {
diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h
index 74032201..c51bcdea 100644
--- a/src/nfa/sheng_impl4.h
+++ b/src/nfa/sheng_impl4.h
@@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
         return MO_CONTINUE_MATCHING;
     }

-    m128 cur_state = set16x8(*state);
+    m128 cur_state = set1_16x8(*state);
     const m128 *masks = s->shuffle_masks;

     while (likely(end - cur_buf >= 4)) {
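The noodle, mcclellan, mcsheng and sheng hunks all use the same broadcast-and-compare idiom the rename makes explicit: splat one byte (a literal character or a state id) across a vector, then compare against 16 bytes at once — doSherman16, for instance, matches one input character against 16 stored state characters in a single step. A compact SSE2 sketch of the idiom; the function name is mine, and __builtin_ctz assumes GCC/Clang:

#include <emmintrin.h>
#include <stdint.h>

// Splat the needle byte, compare 16 input bytes, and reduce to a bitmask.
static int first_match16(const uint8_t *buf, uint8_t c) {
    __m128i needle = _mm_set1_epi8((char)c);             // set1_16x8(c)
    __m128i data = _mm_loadu_si128((const __m128i *)buf);
    unsigned z = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(data, needle));
    return z ? __builtin_ctz(z) : -1;   // offset of first match, or none
}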
diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c
index 09ffc0cf..e76dcca8 100644
--- a/src/nfa/shufti.c
+++ b/src/nfa/shufti.c
@@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
     }

     const m128 zeroes = zeroes128();
-    const m128 low4bits = _mm_set1_epi8(0xf);
+    const m128 low4bits = set1_16x8(0xf);
     const u8 *rv;

     size_t min = (size_t)buf % 16;
@@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
     }

     const m128 zeroes = zeroes128();
-    const m128 low4bits = _mm_set1_epi8(0xf);
+    const m128 low4bits = set1_16x8(0xf);
     const u8 *rv;

     assert(buf_end - buf >= 16);
@@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                            m128 mask2_lo, m128 mask2_hi,
                            const u8 *buf, const u8 *buf_end) {
     const m128 ones = ones128();
-    const m128 low4bits = _mm_set1_epi8(0xf);
+    const m128 low4bits = set1_16x8(0xf);
     const u8 *rv;

     size_t min = (size_t)buf % 16;
@@ -455,15 +455,15 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                          buf, buf_end);
     }

-    const m256 low4bits = set32x8(0xf);
+    const m256 low4bits = set1_32x8(0xf);

     if (buf_end - buf <= 32) {
         return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
     }

     const m256 zeroes = zeroes256();
-    const m256 wide_mask_lo = set2x128(mask_lo);
-    const m256 wide_mask_hi = set2x128(mask_hi);
+    const m256 wide_mask_lo = set1_2x128(mask_lo);
+    const m256 wide_mask_hi = set1_2x128(mask_hi);
     const u8 *rv;

     size_t min = (size_t)buf % 32;
@@ -579,15 +579,15 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                          buf, buf_end);
     }

-    const m256 low4bits = set32x8(0xf);
+    const m256 low4bits = set1_32x8(0xf);

     if (buf_end - buf <= 32) {
         return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
     }

     const m256 zeroes = zeroes256();
-    const m256 wide_mask_lo = set2x128(mask_lo);
-    const m256 wide_mask_hi = set2x128(mask_hi);
+    const m256 wide_mask_lo = set1_2x128(mask_lo);
+    const m256 wide_mask_hi = set1_2x128(mask_hi);
     const u8 *rv;

     assert(buf_end - buf >= 32);
@@ -676,7 +676,7 @@ static really_inline
 const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
                             m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
     DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
-    const m256 low4bits = set32x8(0xf);
+    const m256 low4bits = set1_32x8(0xf);
     // run shufti over two overlapping 16-byte unaligned reads
     const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
     const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
@@ -708,11 +708,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
     }

     const m256 ones = ones256();
-    const m256 low4bits = set32x8(0xf);
-    const m256 wide_mask1_lo = set2x128(mask1_lo);
-    const m256 wide_mask1_hi = set2x128(mask1_hi);
-    const m256 wide_mask2_lo = set2x128(mask2_lo);
-    const m256 wide_mask2_hi = set2x128(mask2_hi);
+    const m256 low4bits = set1_32x8(0xf);
+    const m256 wide_mask1_lo = set1_2x128(mask1_lo);
+    const m256 wide_mask1_hi = set1_2x128(mask1_hi);
+    const m256 wide_mask2_lo = set1_2x128(mask2_lo);
+    const m256 wide_mask2_hi = set1_2x128(mask2_hi);
     const u8 *rv;

     size_t min = (size_t)buf % 32;
diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c
index be6b312c..37af13ad 100644
--- a/src/nfa/truffle.c
+++ b/src/nfa/truffle.c
@@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {

 static really_inline
 u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
-    m128 highconst = _mm_set1_epi8(0x80);
-    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
+    m128 highconst = set1_16x8(0x80);
+    m128 shuf_mask_hi = set1_2x64(0x8040201008040201);

     // and now do the real work
     m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
@@ -260,8 +260,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {

 static really_inline
 u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
-    m256 highconst = _mm256_set1_epi8(0x80);
-    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
+    m256 highconst = set1_32x8(0x80);
+    m256 shuf_mask_hi = set1_4x64(0x8040201008040201);

     // and now do the real work
     m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
@@ -315,8 +315,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
     DEBUG_PRINTF("len %zu\n", buf_end - buf);
-    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
-    const m256 wide_set = set2x128(shuf_mask_lo_highset);
+    const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
+    const m256 wide_set = set1_2x128(shuf_mask_lo_highset);

     assert(buf && buf_end);
     assert(buf < buf_end);

@@ -382,8 +382,8 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,

 const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                        const u8 *buf, const u8 *buf_end) {
-    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
-    const m256 wide_set = set2x128(shuf_mask_lo_highset);
+    const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
+    const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
     assert(buf && buf_end);
     assert(buf < buf_end);
     const u8 *rv;
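The truffle hunks keep both constants intact: highconst (0x80) steers each byte to one of the two low-nibble tables depending on its top bit, and the set1_2x64(0x8040201008040201) value is a bit-per-index table that turns bits 4-6 of each byte into a single selectable bit. A scalar model of block()'s per-byte decision, under my reading of those constants — the function name is illustrative, not the library's:

#include <stdint.h>

// One byte of truffle: pick a 16-entry table by the top bit, index it with
// the low nibble, and test the bit selected by bits 4-6 (the role of the
// repeating 0x80,0x40,...,0x01 byte pattern in shuf_mask_hi).
static int truffle_hit(uint8_t b, const uint8_t lo_highclear[16],
                       const uint8_t lo_highset[16]) {
    const uint8_t *tbl = (b & 0x80) ? lo_highset : lo_highclear;
    uint8_t bit = (uint8_t)(1u << ((b >> 4) & 0x7));
    return (tbl[b & 0xf] & bit) != 0;
}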
diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h
index 3307486c..dc56a5f1 100644
--- a/src/nfa/vermicelli_sse.h
+++ b/src/nfa/vermicelli_sse.h
@@ -36,7 +36,7 @@

 #define VERM_BOUNDARY 16
 #define VERM_TYPE m128
-#define VERM_SET_FN set16x8
+#define VERM_SET_FN set1_16x8

 static really_inline
 const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
@@ -74,7 +74,7 @@ static really_inline
 const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
                                   const u8 *buf_end, char negate) {
     assert((size_t)buf % 16 == 0);
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);

     for (; buf + 31 < buf_end; buf += 32) {
         m128 data = load128(buf);
@@ -122,7 +122,7 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
 // returns NULL if not found
 static really_inline
 const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);
     m128 data = loadu128(buf); // unaligned
     u32 z = movemask128(eq128(chars, and128(casemask, data)));
     if (negate) {
@@ -157,7 +157,7 @@ static really_inline
 const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
                                    const u8 *buf, const u8 *buf_end) {
     assert((size_t)buf % 16 == 0);
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);

     for (; buf + 16 < buf_end; buf += 16) {
         m128 data = load128(buf);
@@ -219,7 +219,7 @@ const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
 static really_inline
 const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
     /* due to laziness, nonalphas and nocase having interesting behaviour */
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);
     m128 data = loadu128(buf); // unaligned
     m128 v = and128(casemask, data);
     u32 z = movemask128(and128(eq128(chars1, v),
@@ -277,7 +277,7 @@ static really_inline
 const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
                                    const u8 *buf_end, char negate) {
     assert((size_t)buf_end % 16 == 0);
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);

     for (; buf + 15 < buf_end; buf_end -= 16) {
         m128 data = load128(buf_end - 16);
@@ -309,7 +309,7 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
 // returns NULL if not found
 static really_inline
 const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);
     m128 data = loadu128(buf); // unaligned
     u32 z = movemask128(eq128(chars, and128(casemask, data)));
     if (negate) {
@@ -344,7 +344,7 @@ static really_inline
 const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
                                     const u8 *buf, const u8 *buf_end) {
     assert((size_t)buf_end % 16 == 0);
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);

     for (; buf + 16 < buf_end; buf_end -= 16) {
         m128 data = load128(buf_end - 16);
@@ -381,7 +381,7 @@ const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
 static really_inline
 const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
     /* due to laziness, nonalphas and nocase having interesting behaviour */
-    m128 casemask = set16x8(CASE_CLEAR);
+    m128 casemask = set1_16x8(CASE_CLEAR);
     m128 data = loadu128(buf);
     m128 v = and128(casemask, data);
     u32 z = movemask128(and128(eq128(chars2, v),
@@ -398,7 +398,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {

 #define VERM_BOUNDARY 64
 #define VERM_TYPE m512
-#define VERM_SET_FN set64x8
+#define VERM_SET_FN set1_64x8

 static really_inline
 const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {
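All the vermicelli nocase variants lean on one identity the rename leaves untouched: AND-ing input with CASE_CLEAR (0xdf, the same constant the noodle getCaseMask hunks show) clears bit 5, which folds ASCII 'a'..'z' onto 'A'..'Z', so a single broadcast-compare handles both cases. A compact sketch, assuming SSE2 and an uppercase needle; the function name is mine:

#include <emmintrin.h>
#include <stdint.h>

// Case-fold 16 input bytes with 0xdf, then compare against the splatted
// uppercase needle; the returned bitmask has one bit per matching byte.
static unsigned verm_nocase16(const uint8_t *buf, uint8_t upper_c) {
    __m128i casemask = _mm_set1_epi8((char)0xdf);  // set1_16x8(CASE_CLEAR)
    __m128i chars = _mm_set1_epi8((char)upper_c);  // VERM_SET_FN(c)
    __m128i data = _mm_loadu_si128((const __m128i *)buf);
    return (unsigned)_mm_movemask_epi8(
        _mm_cmpeq_epi8(chars, _mm_and_si128(casemask, data)));
}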
diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h
index 976208b7..6210fca5 100644
--- a/src/rose/counting_miracle.h
+++ b/src/rose/counting_miracle.h
@@ -47,7 +47,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end,

     u32 count = *count_inout;

-    m128 chars = set16x8(c);
+    m128 chars = set1_16x8(c);

     for (; d + 16 <= d_end; d_end -= 16) {
         m128 data = loadu128(d_end - 16);
@@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
     u32 count = *count_inout;

     const m128 zeroes = zeroes128();
-    const m128 low4bits = _mm_set1_epi8(0xf);
+    const m128 low4bits = set1_16x8(0xf);

     for (; d + 16 <= d_end; d_end -= 16) {
         m128 data = loadu128(d_end - 16);
diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c
index 0f2d1083..d01e30e8 100644
--- a/src/rose/program_runtime.c
+++ b/src/rose/program_runtime.c
@@ -938,7 +938,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask,
         return 1;
     }

-    m256 data_m256 = set2x128(data);
+    m256 data_m256 = set1_2x128(data);
     m256 hi_mask_m256 = loadu256(hi_mask);
     m256 lo_mask_m256 = loadu256(lo_mask);
     m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
@@ -974,8 +974,8 @@ int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask,

     m128 hi_mask_m128 = loadu128(hi_mask);
     m128 lo_mask_m128 = loadu128(lo_mask);
-    m256 hi_mask_m256 = set2x128(hi_mask_m128);
-    m256 lo_mask_m256 = set2x128(lo_mask_m128);
+    m256 hi_mask_m256 = set1_2x128(hi_mask_m128);
+    m256 lo_mask_m256 = set1_2x128(lo_mask_m128);
     m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
     if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256,
                                bucket_select_mask_m256,
@@ -1287,7 +1287,7 @@ int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch,
         u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
         DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
         DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
-        expand_valid = set64x2(valid_hi, valid_lo);
+        expand_valid = set2x64(valid_hi, valid_lo);
         valid_path_mask = ~movemask128(pshufb_m128(expand_valid,
                                                    data_select_mask));
     }
@@ -1332,7 +1332,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,

     u32 valid_data_mask;
     m128 data_m128 = getData128(ci, offset, &valid_data_mask);
-    m256 data_double = set2x128(data_m128);
+    m256 data_double = set1_2x128(data_m128);
     m256 data_select_mask = loadu256(ri->data_select_mask);

     u32 valid_path_mask = 0;
@@ -1346,7 +1346,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
         u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
         DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
         DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
-        expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+        expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
                                valid_lo);
         valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
                                                    data_select_mask));
@@ -1393,7 +1393,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,

     u32 valid_data_mask;
     m128 data_m128 = getData128(ci, offset, &valid_data_mask);
-    m256 data_double = set2x128(data_m128);
+    m256 data_double = set1_2x128(data_m128);
     m256 data_select_mask = loadu256(ri->data_select_mask);

     u32 valid_path_mask = 0;
@@ -1407,7 +1407,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
         u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
         DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
         DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
-        expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+        expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
                                valid_lo);
         valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
                                                    data_select_mask));
@@ -1460,7 +1460,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,

     u32 valid_data_mask;
     m128 data_m128 = getData128(ci, offset, &valid_data_mask);
-    m256 data_m256 = set2x128(data_m128);
+    m256 data_m256 = set1_2x128(data_m128);
     m256 data_select_mask_1 = loadu256(ri->data_select_mask);
     m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32);

@@ -1475,7 +1475,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
         u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
         DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
         DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
-        expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+        expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
                                valid_lo);
         u32 valid_path_1 = movemask256(pshufb_m256(expand_valid,
                                                    data_select_mask_1));
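The rose multipath hunks duplicate one 128-bit block of scanned data into both halves of a 256-bit vector (set1_2x128) before a 256-bit pshufb. That pays off because VPSHUFB shuffles each 128-bit lane independently, so the two lanes of data_select_mask can apply two different byte selections to the same 16 bytes in one instruction. A minimal AVX2 sketch of the pattern; the function name is mine, not the library's:

#include <immintrin.h>

// Broadcast 16 bytes into both lanes, then shuffle each lane with its own
// selector. Requires AVX2 (-mavx2).
static __m256i two_shuffles_of(__m128i data, __m256i select) {
    __m256i dd = _mm256_broadcastsi128_si256(data);   // set1_2x128(data)
    return _mm256_shuffle_epi8(dd, select);           // pshufb_m256
}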
diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h
index 1dc855d9..3b91f091 100644
--- a/src/rose/validate_shufti.h
+++ b/src/rose/validate_shufti.h
@@ -47,7 +47,7 @@ static really_inline
 int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
                             const m256 lo_mask, const m256 and_mask,
                             const u32 neg_mask, const u32 valid_data_mask) {
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
     m256 c_hi = pshufb_m256(hi_mask,
                             rshift64_m256(andnot256(low4bits, data), 4));
@@ -78,7 +78,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
                            const m128 and_mask, const u32 neg_mask,
                            const u32 valid_data_mask) {
     m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
     m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
     m128 nresult = eq128(and128(t, and_mask), zeroes128());
@@ -101,7 +101,7 @@ static really_inline
 int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
                            const m256 lo_mask, const m256 and_mask,
                            const u32 neg_mask, const u32 valid_data_mask) {
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
     m256 c_hi = pshufb_m256(hi_mask,
                             rshift64_m256(andnot256(low4bits, data), 4));
@@ -133,7 +133,7 @@ int validateShuftiMask32x16(const m256 data,
                             const m256 bucket_mask_hi,
                             const m256 bucket_mask_lo, const u32 neg_mask,
                             const u32 valid_data_mask) {
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 data_lo = and256(data, low4bits);
     m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
     m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@@ -201,7 +201,7 @@ int validateMultipathShuftiMask16x8(const m128 data,
                                     const u32 neg_mask,
                                     const u32 valid_path_mask) {
     m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
     m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
     m128 result = and128(t, bucket_select_mask);
@@ -220,7 +220,7 @@ int validateMultipathShuftiMask32x8(const m256 data,
                                     const u32 hi_bits, const u32 lo_bits,
                                     const u32 neg_mask,
                                     const u32 valid_path_mask) {
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 data_lo = and256(data, low4bits);
     m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
     m256 c_lo = pshufb_m256(lo_mask, data_lo);
@@ -244,7 +244,7 @@ int validateMultipathShuftiMask32x16(const m256 data,
                                      const u32 hi_bits, const u32 lo_bits,
                                      const u32 neg_mask,
                                      const u32 valid_path_mask) {
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 data_lo = and256(data, low4bits);
     m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
     m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@@ -271,7 +271,7 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
                                   const u64a hi_bits, const u64a lo_bits,
                                   const u64a neg_mask,
                                   const u64a valid_path_mask) {
-    m256 low4bits = set32x8(0xf);
+    m256 low4bits = set1_32x8(0xf);
     m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
     m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
     m256 c_hi_1 = pshufb_m256(hi_mask,
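A detail worth noting in validate_shufti.h: some validators compute the high nibble as rshift64(andnot(low4bits, data), 4), while other files in this diff use and(rshift64(data, 4), low4bits). Both orderings are needed for correctness because the shift operates on 64-bit lanes rather than bytes, so a byte's neighbour bits must be cleared before the shift or masked afterwards. A scalar demonstration on one 64-bit lane:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t lane = 0x00000000000012abULL;   // byte 0 is 0xab, byte 1 is 0x12
    // Plain lane-wide shift smears byte 1's low nibble into byte 0:
    uint64_t shifted = lane >> 4;                            // byte 0: 0x2a
    // Clearing the low nibbles first (andnot) gives the true high nibble:
    uint64_t fixed = (lane & ~0x0f0f0f0f0f0f0f0fULL) >> 4;   // byte 0: 0x0a
    printf("%02x vs %02x\n", (unsigned)(shifted & 0xff),
           (unsigned)(fixed & 0xff));
    return 0;
}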
diff --git a/src/util/state_compress.c b/src/util/state_compress.c
index 7238849e..e6cf205c 100644
--- a/src/util/state_compress.c
+++ b/src/util/state_compress.c
@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
     u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                  expand32(v[2], m[2]), expand32(v[3], m[3]) };

-    return _mm_set_epi32(x[3], x[2], x[1], x[0]);
+    return set32x4(x[3], x[2], x[1], x[0]);
 }
 #endif

@@ -158,7 +158,7 @@ static really_inline
 m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
     // First, decompose our vectors into 64-bit chunks.
-    u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) };
+    u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) };

     u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
     u64a v[2];

@@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {

     u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };

-    return _mm_set_epi64x(x[1], x[0]);
+    return set2x64(x[1], x[0]);
 }
 #endif

@@ -264,11 +264,11 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
                  expand32(v[6], m[6]), expand32(v[7], m[7]) };

 #if !defined(HAVE_AVX2)
-    m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
-                  .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
+    m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]),
+                  .hi = set32x4(x[7], x[6], x[5], x[4]) };
 #else
-    m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
-                                 x[3], x[2], x[1], x[0]);
+    m256 xvec = set32x8(x[7], x[6], x[5], x[4],
+                        x[3], x[2], x[1], x[0]);
 #endif
     return xvec;
 }
@@ -291,10 +291,10 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
                  expand64(v[2], m[2]), expand64(v[3], m[3]) };

 #if !defined(HAVE_AVX2)
-    m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
-                  .hi = _mm_set_epi64x(x[3], x[2]) };
+    m256 xvec = { .lo = set2x64(x[1], x[0]),
+                  .hi = set2x64(x[3], x[2]) };
 #else
-    m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
+    m256 xvec = set4x64(x[3], x[2], x[1], x[0]);
 #endif
     return xvec;
 }
@@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
                   expand32(v[8], m[8]), expand32(v[9], m[9]),
                   expand32(v[10], m[10]), expand32(v[11], m[11]) };

-    m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
-                  .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]),
-                  .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) };
+    m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]),
+                  .mid = set32x4(x[7], x[6], x[5], x[4]),
+                  .hi = set32x4(x[11], x[10], x[9], x[8]) };
     return xvec;
 }
 #endif
@@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
                  expand64(v[2], m[2]), expand64(v[3], m[3]),
                  expand64(v[4], m[4]), expand64(v[5], m[5]) };

-    m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
-                  .mid = _mm_set_epi64x(x[3], x[2]),
-                  .hi = _mm_set_epi64x(x[5], x[4]) };
+    m384 xvec = { .lo = set2x64(x[1], x[0]),
+                  .mid = set2x64(x[3], x[2]),
+                  .hi = set2x64(x[5], x[4]) };
     return xvec;
 }
 #endif
@@ -548,20 +548,20 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {

     m512 xvec;
 #if defined(HAVE_AVX512)
-    xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
-                            x[11], x[10], x[9], x[8],
-                            x[7], x[6], x[5], x[4],
-                            x[3], x[2], x[1], x[0]);
+    xvec = set32x16(x[15], x[14], x[13], x[12],
+                    x[11], x[10], x[9], x[8],
+                    x[7], x[6], x[5], x[4],
+                    x[3], x[2], x[1], x[0]);
 #elif defined(HAVE_AVX2)
-    xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
-                               x[3], x[2], x[1], x[0]);
-    xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
-                               x[11], x[10], x[9], x[8]);
+    xvec.lo = set32x8(x[7], x[6], x[5], x[4],
+                      x[3], x[2], x[1], x[0]);
+    xvec.hi = set32x8(x[15], x[14], x[13], x[12],
+                      x[11], x[10], x[9], x[8]);
 #else
-    xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
-    xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
-    xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
-    xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
+    xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]);
+    xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]);
+    xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]);
+    xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]);
 #endif
     return xvec;
 }
@@ -588,16 +588,16 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
                  expand64(v[6], m[6]), expand64(v[7], m[7]) };

 #if defined(HAVE_AVX512)
-    m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
+    m512 xvec = set64x8(x[7], x[6], x[5], x[4],
                         x[3], x[2], x[1], x[0]);
 #elif defined(HAVE_AVX2)
-    m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
-                  .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
+    m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]),
+                  .hi = set4x64(x[7], x[6], x[5], x[4])};
 #else
-    m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
-                          _mm_set_epi64x(x[3], x[2]) },
-                  .hi = { _mm_set_epi64x(x[5], x[4]),
-                          _mm_set_epi64x(x[7], x[6]) } };
+    m512 xvec = { .lo = { set2x64(x[1], x[0]),
+                          set2x64(x[3], x[2]) },
+                  .hi = { set2x64(x[5], x[4]),
+                          set2x64(x[7], x[6]) } };
 #endif
     return xvec;
 }
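The state_compress hunks swap raw _mm_set_* intrinsics for the portable per-lane wrappers while keeping Intel's argument order: the highest lane comes first, so set2x64(x[1], x[0]) puts x[0] in the low 64 bits. A small check of that convention, assuming the wrapper is a thin shim over _mm_set_epi64x (the real set2x64 lives in Hyperscan's simd_utils headers, not in this diff):

#include <emmintrin.h>
#include <stdint.h>
#include <assert.h>

typedef __m128i m128;

// Assumed shim for illustration only.
static inline m128 set2x64(uint64_t hi, uint64_t lo) {
    return _mm_set_epi64x((long long)hi, (long long)lo);
}

int main(void) {
    uint64_t out[2];
    _mm_storeu_si128((__m128i *)out, set2x64(0x1111, 0x2222));
    assert(out[0] == 0x2222 && out[1] == 0x1111); // lowest lane stored first
    return 0;
}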