fix names, use own intrinsic instead of explicit _mm* ones

This commit is contained in:
Konstantinos Margaritis 2020-09-23 11:51:21 +03:00
parent f7a6b8934c
commit 5333467249
15 changed files with 137 additions and 137 deletions

View File

@ -311,26 +311,26 @@ const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
#define PREPARE_MASKS_1 \ #define PREPARE_MASKS_1 \
dup_mask[0] = set4x128(maskBase[0]); \ dup_mask[0] = set1_4x128(maskBase[0]); \
dup_mask[1] = set4x128(maskBase[1]); dup_mask[1] = set1_4x128(maskBase[1]);
#define PREPARE_MASKS_2 \ #define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \ PREPARE_MASKS_1 \
dup_mask[2] = set4x128(maskBase[2]); \ dup_mask[2] = set1_4x128(maskBase[2]); \
dup_mask[3] = set4x128(maskBase[3]); dup_mask[3] = set1_4x128(maskBase[3]);
#define PREPARE_MASKS_3 \ #define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \ PREPARE_MASKS_2 \
dup_mask[4] = set4x128(maskBase[4]); \ dup_mask[4] = set1_4x128(maskBase[4]); \
dup_mask[5] = set4x128(maskBase[5]); dup_mask[5] = set1_4x128(maskBase[5]);
#define PREPARE_MASKS_4 \ #define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \ PREPARE_MASKS_3 \
dup_mask[6] = set4x128(maskBase[6]); \ dup_mask[6] = set1_4x128(maskBase[6]); \
dup_mask[7] = set4x128(maskBase[7]); dup_mask[7] = set1_4x128(maskBase[7]);
#define PREPARE_MASKS(n) \ #define PREPARE_MASKS(n) \
m512 lo_mask = set64x8(0xf); \ m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \ m512 dup_mask[n * 2]; \
m512 sl_msk[n - 1]; \ m512 sl_msk[n - 1]; \
PREPARE_MASKS_##n \ PREPARE_MASKS_##n \
@ -570,26 +570,26 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
&c_0, &c_16, &c_32, &c_48) &c_0, &c_16, &c_32, &c_48)
#define PREPARE_MASKS_1 \ #define PREPARE_MASKS_1 \
dup_mask[0] = set4x128(maskBase[0]); \ dup_mask[0] = set1_4x128(maskBase[0]); \
dup_mask[1] = set4x128(maskBase[1]); dup_mask[1] = set1_4x128(maskBase[1]);
#define PREPARE_MASKS_2 \ #define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \ PREPARE_MASKS_1 \
dup_mask[2] = set4x128(maskBase[2]); \ dup_mask[2] = set1_4x128(maskBase[2]); \
dup_mask[3] = set4x128(maskBase[3]); dup_mask[3] = set1_4x128(maskBase[3]);
#define PREPARE_MASKS_3 \ #define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \ PREPARE_MASKS_2 \
dup_mask[4] = set4x128(maskBase[4]); \ dup_mask[4] = set1_4x128(maskBase[4]); \
dup_mask[5] = set4x128(maskBase[5]); dup_mask[5] = set1_4x128(maskBase[5]);
#define PREPARE_MASKS_4 \ #define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \ PREPARE_MASKS_3 \
dup_mask[6] = set4x128(maskBase[6]); \ dup_mask[6] = set1_4x128(maskBase[6]); \
dup_mask[7] = set4x128(maskBase[7]); dup_mask[7] = set1_4x128(maskBase[7]);
#define PREPARE_MASKS(n) \ #define PREPARE_MASKS(n) \
m512 lo_mask = set64x8(0xf); \ m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \ m512 dup_mask[n * 2]; \
PREPARE_MASKS_##n PREPARE_MASKS_##n
@ -713,7 +713,7 @@ do { \
#define PREP_SHUF_MASK \ #define PREP_SHUF_MASK \
PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \
*c_128 = *(ptr + 15); \ *c_128 = *(ptr + 15); \
m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
*c_0 = *(ptr + 31) *c_0 = *(ptr + 31)
#define SHIFT_OR_M1 \ #define SHIFT_OR_M1 \
@ -805,26 +805,26 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
#define PREPARE_MASKS_1 \ #define PREPARE_MASKS_1 \
dup_mask[0] = set2x128(maskBase[0]); \ dup_mask[0] = set1_2x128(maskBase[0]); \
dup_mask[1] = set2x128(maskBase[1]); dup_mask[1] = set1_2x128(maskBase[1]);
#define PREPARE_MASKS_2 \ #define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \ PREPARE_MASKS_1 \
dup_mask[2] = set2x128(maskBase[2]); \ dup_mask[2] = set1_2x128(maskBase[2]); \
dup_mask[3] = set2x128(maskBase[3]); dup_mask[3] = set1_2x128(maskBase[3]);
#define PREPARE_MASKS_3 \ #define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \ PREPARE_MASKS_2 \
dup_mask[4] = set2x128(maskBase[4]); \ dup_mask[4] = set1_2x128(maskBase[4]); \
dup_mask[5] = set2x128(maskBase[5]); dup_mask[5] = set1_2x128(maskBase[5]);
#define PREPARE_MASKS_4 \ #define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \ PREPARE_MASKS_3 \
dup_mask[6] = set2x128(maskBase[6]); \ dup_mask[6] = set1_2x128(maskBase[6]); \
dup_mask[7] = set2x128(maskBase[7]); dup_mask[7] = set1_2x128(maskBase[7]);
#define PREPARE_MASKS(n) \ #define PREPARE_MASKS(n) \
m256 lo_mask = set32x8(0xf); \ m256 lo_mask = set1_32x8(0xf); \
m256 dup_mask[n * 2]; \ m256 dup_mask[n * 2]; \
PREPARE_MASKS_##n PREPARE_MASKS_##n
@ -925,7 +925,7 @@ do { \
static really_inline static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
m128 mask = set16x8(0xf); m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
return or128(pshufb_m128(maskBase[0 * 2], lo), return or128(pshufb_m128(maskBase[0 * 2], lo),
@ -934,7 +934,7 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
static really_inline static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
m128 mask = set16x8(0xf); m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, val); m128 r = prep_conf_teddy_m1(maskBase, val);
@ -949,7 +949,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
static really_inline static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 val) { m128 val) {
m128 mask = set16x8(0xf); m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, val); m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
@ -964,7 +964,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
static really_inline static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 *old_3, m128 val) { m128 *old_3, m128 val) {
m128 mask = set16x8(0xf); m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask); m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask); m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);

View File

@ -501,15 +501,15 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *buf_history, size_t len_history, const u8 *buf_history, size_t len_history,
const u32 nMasks) { const u32 nMasks) {
m128 p_mask128; m128 p_mask128;
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks)); buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128); *p_mask = set1_2x128(p_mask128);
return ret; return ret;
} }
static really_inline static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
m256 mask = set32x8(0xf); m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
return or256(pshufb_m256(maskBase[0 * 2], lo), return or256(pshufb_m256(maskBase[0 * 2], lo),
@ -518,7 +518,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
static really_inline static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
m256 mask = set32x8(0xf); m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val); m256 r = prep_conf_fat_teddy_m1(maskBase, val);
@ -533,7 +533,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
static really_inline static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 val) { m256 val) {
m256 mask = set32x8(0xf); m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
@ -548,7 +548,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
static really_inline static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 *old_3, m256 val) { m256 *old_3, m256 val) {
m256 mask = set32x8(0xf); m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask); m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask); m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);

View File

@ -30,11 +30,11 @@
static really_inline m256 getMask(u8 c, bool noCase) { static really_inline m256 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase); u8 k = caseClear8(c, noCase);
return set32x8(k); return set1_32x8(k);
} }
static really_inline m256 getCaseMask(void) { static really_inline m256 getCaseMask(void) {
return set32x8(0xdf); return set1_32x8(0xdf);
} }
static really_inline static really_inline

View File

@ -30,11 +30,11 @@
static really_inline m128 getMask(u8 c, bool noCase) { static really_inline m128 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase); u8 k = caseClear8(c, noCase);
return set16x8(k); return set1_16x8(k);
} }
static really_inline m128 getCaseMask(void) { static really_inline m128 getCaseMask(void) {
return set16x8(0xdf); return set1_16x8(0xdf);
} }
static really_inline static really_inline

View File

@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
if (len) { if (len) {
m128 ss_char = load128(sherman_state); m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime); m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char)); u32 z = movemask128(eq128(ss_char, cur_char));

View File

@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
if (len) { if (len) {
m128 ss_char = load128(sherman_state); m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime); m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char)); u32 z = movemask128(eq128(ss_char, cur_char));
@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
assert(s_in); /* should not already be dead */ assert(s_in); /* should not already be dead */
assert(soft_c_end <= hard_c_end); assert(soft_c_end <= hard_c_end);
DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
m128 s = set16x8(s_in - 1); m128 s = set1_16x8(s_in - 1);
const u8 *c = *c_inout; const u8 *c = *c_inout;
const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
if (!do_accel) { if (!do_accel) {
@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) #if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
u32 sheng_limit_x4 = sheng_limit * 0x01010101; u32 sheng_limit_x4 = sheng_limit * 0x01010101;
m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4);
m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit);
DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit,
m->sheng_accel_limit, sheng_stop_limit); m->sheng_accel_limit, sheng_stop_limit);
#endif #endif

View File

@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
} }
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
m128 cur_state = set16x8(*state); m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks; const m128 *masks = s->shuffle_masks;
while (likely(cur_buf != end)) { while (likely(cur_buf != end)) {

View File

@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
return MO_CONTINUE_MATCHING; return MO_CONTINUE_MATCHING;
} }
m128 cur_state = set16x8(*state); m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks; const m128 *masks = s->shuffle_masks;
while (likely(end - cur_buf >= 4)) { while (likely(end - cur_buf >= 4)) {

View File

@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
} }
const m128 zeroes = zeroes128(); const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf); const m128 low4bits = set1_16x8(0xf);
const u8 *rv; const u8 *rv;
size_t min = (size_t)buf % 16; size_t min = (size_t)buf % 16;
@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
} }
const m128 zeroes = zeroes128(); const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf); const m128 low4bits = set1_16x8(0xf);
const u8 *rv; const u8 *rv;
assert(buf_end - buf >= 16); assert(buf_end - buf >= 16);
@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
m128 mask2_lo, m128 mask2_hi, m128 mask2_lo, m128 mask2_hi,
const u8 *buf, const u8 *buf_end) { const u8 *buf, const u8 *buf_end) {
const m128 ones = ones128(); const m128 ones = ones128();
const m128 low4bits = _mm_set1_epi8(0xf); const m128 low4bits = set1_16x8(0xf);
const u8 *rv; const u8 *rv;
size_t min = (size_t)buf % 16; size_t min = (size_t)buf % 16;
@ -455,15 +455,15 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end); buf, buf_end);
} }
const m256 low4bits = set32x8(0xf); const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) { if (buf_end - buf <= 32) {
return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
} }
const m256 zeroes = zeroes256(); const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo); const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi); const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv; const u8 *rv;
size_t min = (size_t)buf % 32; size_t min = (size_t)buf % 32;
@ -579,15 +579,15 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end); buf, buf_end);
} }
const m256 low4bits = set32x8(0xf); const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) { if (buf_end - buf <= 32) {
return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
} }
const m256 zeroes = zeroes256(); const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo); const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi); const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv; const u8 *rv;
assert(buf_end - buf >= 32); assert(buf_end - buf >= 32);
@ -676,7 +676,7 @@ static really_inline
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
m128 mask2_hi, const u8 *buf, const u8 *buf_end) { m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
const m256 low4bits = set32x8(0xf); const m256 low4bits = set1_32x8(0xf);
// run shufti over two overlapping 16-byte unaligned reads // run shufti over two overlapping 16-byte unaligned reads
const m256 mask1 = combine2x128(mask1_hi, mask1_lo); const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
const m256 mask2 = combine2x128(mask2_hi, mask2_lo); const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
@ -708,11 +708,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
} }
const m256 ones = ones256(); const m256 ones = ones256();
const m256 low4bits = set32x8(0xf); const m256 low4bits = set1_32x8(0xf);
const m256 wide_mask1_lo = set2x128(mask1_lo); const m256 wide_mask1_lo = set1_2x128(mask1_lo);
const m256 wide_mask1_hi = set2x128(mask1_hi); const m256 wide_mask1_hi = set1_2x128(mask1_hi);
const m256 wide_mask2_lo = set2x128(mask2_lo); const m256 wide_mask2_lo = set1_2x128(mask2_lo);
const m256 wide_mask2_hi = set2x128(mask2_hi); const m256 wide_mask2_hi = set1_2x128(mask2_hi);
const u8 *rv; const u8 *rv;
size_t min = (size_t)buf % 32; size_t min = (size_t)buf % 32;

View File

@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
m128 highconst = _mm_set1_epi8(0x80); m128 highconst = set1_16x8(0x80);
m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); m128 shuf_mask_hi = set1_2x64(0x8040201008040201);
// and now do the real work // and now do the real work
m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
@ -260,8 +260,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
m256 highconst = _mm256_set1_epi8(0x80); m256 highconst = set1_32x8(0x80);
m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); m256 shuf_mask_hi = set1_4x64(0x8040201008040201);
// and now do the real work // and now do the real work
m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v); m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
@ -315,8 +315,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) { const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("len %zu\n", buf_end - buf); DEBUG_PRINTF("len %zu\n", buf_end - buf);
const m256 wide_clear = set2x128(shuf_mask_lo_highclear); const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset); const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end); assert(buf && buf_end);
assert(buf < buf_end); assert(buf < buf_end);
@ -382,8 +382,8 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset, m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) { const u8 *buf, const u8 *buf_end) {
const m256 wide_clear = set2x128(shuf_mask_lo_highclear); const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset); const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end); assert(buf && buf_end);
assert(buf < buf_end); assert(buf < buf_end);
const u8 *rv; const u8 *rv;

View File

@ -36,7 +36,7 @@
#define VERM_BOUNDARY 16 #define VERM_BOUNDARY 16
#define VERM_TYPE m128 #define VERM_TYPE m128
#define VERM_SET_FN set16x8 #define VERM_SET_FN set1_16x8
static really_inline static really_inline
const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
@ -74,7 +74,7 @@ static really_inline
const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) { const u8 *buf_end, char negate) {
assert((size_t)buf % 16 == 0); assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 31 < buf_end; buf += 32) { for (; buf + 31 < buf_end; buf += 32) {
m128 data = load128(buf); m128 data = load128(buf);
@ -122,7 +122,7 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found // returns NULL if not found
static really_inline static really_inline
const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data))); u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) { if (negate) {
@ -157,7 +157,7 @@ static really_inline
const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) { const u8 *buf, const u8 *buf_end) {
assert((size_t)buf % 16 == 0); assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf += 16) { for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf); m128 data = load128(buf);
@ -219,7 +219,7 @@ const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline static really_inline
const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */ /* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned m128 data = loadu128(buf); // unaligned
m128 v = and128(casemask, data); m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v), u32 z = movemask128(and128(eq128(chars1, v),
@ -277,7 +277,7 @@ static really_inline
const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) { const u8 *buf_end, char negate) {
assert((size_t)buf_end % 16 == 0); assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 15 < buf_end; buf_end -= 16) { for (; buf + 15 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16); m128 data = load128(buf_end - 16);
@ -309,7 +309,7 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found // returns NULL if not found
static really_inline static really_inline
const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data))); u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) { if (negate) {
@ -344,7 +344,7 @@ static really_inline
const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) { const u8 *buf, const u8 *buf_end) {
assert((size_t)buf_end % 16 == 0); assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf_end -= 16) { for (; buf + 16 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16); m128 data = load128(buf_end - 16);
@ -381,7 +381,7 @@ const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline static really_inline
const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */ /* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR); m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); m128 data = loadu128(buf);
m128 v = and128(casemask, data); m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v), u32 z = movemask128(and128(eq128(chars2, v),
@ -398,7 +398,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
#define VERM_BOUNDARY 64 #define VERM_BOUNDARY 64
#define VERM_TYPE m512 #define VERM_TYPE m512
#define VERM_SET_FN set64x8 #define VERM_SET_FN set1_64x8
static really_inline static really_inline
const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {

View File

@ -47,7 +47,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end,
u32 count = *count_inout; u32 count = *count_inout;
m128 chars = set16x8(c); m128 chars = set1_16x8(c);
for (; d + 16 <= d_end; d_end -= 16) { for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16); m128 data = loadu128(d_end - 16);
@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
u32 count = *count_inout; u32 count = *count_inout;
const m128 zeroes = zeroes128(); const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf); const m128 low4bits = set1_16x8(0xf);
for (; d + 16 <= d_end; d_end -= 16) { for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16); m128 data = loadu128(d_end - 16);

View File

@ -938,7 +938,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask,
return 1; return 1;
} }
m256 data_m256 = set2x128(data); m256 data_m256 = set1_2x128(data);
m256 hi_mask_m256 = loadu256(hi_mask); m256 hi_mask_m256 = loadu256(hi_mask);
m256 lo_mask_m256 = loadu256(lo_mask); m256 lo_mask_m256 = loadu256(lo_mask);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
@ -974,8 +974,8 @@ int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask,
m128 hi_mask_m128 = loadu128(hi_mask); m128 hi_mask_m128 = loadu128(hi_mask);
m128 lo_mask_m128 = loadu128(lo_mask); m128 lo_mask_m128 = loadu128(lo_mask);
m256 hi_mask_m256 = set2x128(hi_mask_m128); m256 hi_mask_m256 = set1_2x128(hi_mask_m128);
m256 lo_mask_m256 = set2x128(lo_mask_m128); m256 lo_mask_m256 = set1_2x128(lo_mask_m128);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256,
bucket_select_mask_m256, bucket_select_mask_m256,
@ -1287,7 +1287,7 @@ int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo); DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x2(valid_hi, valid_lo); expand_valid = set2x64(valid_hi, valid_lo);
valid_path_mask = ~movemask128(pshufb_m128(expand_valid, valid_path_mask = ~movemask128(pshufb_m128(expand_valid,
data_select_mask)); data_select_mask));
} }
@ -1332,7 +1332,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
u32 valid_data_mask; u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask); m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_double = set2x128(data_m128); m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask); m256 data_select_mask = loadu256(ri->data_select_mask);
u32 valid_path_mask = 0; u32 valid_path_mask = 0;
@ -1346,7 +1346,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo); DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi, expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo); valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid, valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask)); data_select_mask));
@ -1393,7 +1393,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
u32 valid_data_mask; u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask); m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_double = set2x128(data_m128); m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask); m256 data_select_mask = loadu256(ri->data_select_mask);
u32 valid_path_mask = 0; u32 valid_path_mask = 0;
@ -1407,7 +1407,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo); DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi, expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo); valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid, valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask)); data_select_mask));
@ -1460,7 +1460,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
u32 valid_data_mask; u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask); m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_m256 = set2x128(data_m128); m256 data_m256 = set1_2x128(data_m128);
m256 data_select_mask_1 = loadu256(ri->data_select_mask); m256 data_select_mask_1 = loadu256(ri->data_select_mask);
m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32); m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32);
@ -1475,7 +1475,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo); DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi, expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo); valid_lo);
u32 valid_path_1 = movemask256(pshufb_m256(expand_valid, u32 valid_path_1 = movemask256(pshufb_m256(expand_valid,
data_select_mask_1)); data_select_mask_1));

View File

@ -47,7 +47,7 @@ static really_inline
int validateShuftiMask16x16(const m256 data, const m256 hi_mask, int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask, const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) { const u32 neg_mask, const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask, m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4)); rshift64_m256(andnot256(low4bits, data), 4));
@ -78,7 +78,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
const m128 and_mask, const u32 neg_mask, const m128 and_mask, const u32 neg_mask,
const u32 valid_data_mask) { const u32 valid_data_mask) {
m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits)); m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 nresult = eq128(and128(t, and_mask), zeroes128()); m128 nresult = eq128(and128(t, and_mask), zeroes128());
@ -101,7 +101,7 @@ static really_inline
int validateShuftiMask32x8(const m256 data, const m256 hi_mask, int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask, const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) { const u32 neg_mask, const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask, m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4)); rshift64_m256(andnot256(low4bits, data), 4));
@ -133,7 +133,7 @@ int validateShuftiMask32x16(const m256 data,
const m256 bucket_mask_hi, const m256 bucket_mask_hi,
const m256 bucket_mask_lo, const u32 neg_mask, const m256 bucket_mask_lo, const u32 neg_mask,
const u32 valid_data_mask) { const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits); m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@ -201,7 +201,7 @@ int validateMultipathShuftiMask16x8(const m128 data,
const u32 neg_mask, const u32 neg_mask,
const u32 valid_path_mask) { const u32 valid_path_mask) {
m256 data_256 = combine2x128(rshift64_m128(data, 4), data); m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits)); m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 result = and128(t, bucket_select_mask); m128 result = and128(t, bucket_select_mask);
@ -220,7 +220,7 @@ int validateMultipathShuftiMask32x8(const m256 data,
const u32 hi_bits, const u32 lo_bits, const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask, const u32 neg_mask,
const u32 valid_path_mask) { const u32 valid_path_mask) {
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits); m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo = pshufb_m256(lo_mask, data_lo); m256 c_lo = pshufb_m256(lo_mask, data_lo);
@ -244,7 +244,7 @@ int validateMultipathShuftiMask32x16(const m256 data,
const u32 hi_bits, const u32 lo_bits, const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask, const u32 neg_mask,
const u32 valid_path_mask) { const u32 valid_path_mask) {
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits); m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@ -271,7 +271,7 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
const u64a hi_bits, const u64a lo_bits, const u64a hi_bits, const u64a lo_bits,
const u64a neg_mask, const u64a neg_mask,
const u64a valid_path_mask) { const u64a valid_path_mask) {
m256 low4bits = set32x8(0xf); m256 low4bits = set1_32x8(0xf);
m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits)); m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits)); m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
m256 c_hi_1 = pshufb_m256(hi_mask, m256 c_hi_1 = pshufb_m256(hi_mask,

View File

@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
expand32(v[2], m[2]), expand32(v[3], m[3]) }; expand32(v[2], m[2]), expand32(v[3], m[3]) };
return _mm_set_epi32(x[3], x[2], x[1], x[0]); return set32x4(x[3], x[2], x[1], x[0]);
} }
#endif #endif
@ -158,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
static really_inline static really_inline
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
// First, decompose our vectors into 64-bit chunks. // First, decompose our vectors into 64-bit chunks.
u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) };
u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
u64a v[2]; u64a v[2];
@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };
return _mm_set_epi64x(x[1], x[0]); return set2x64(x[1], x[0]);
} }
#endif #endif
@ -264,11 +264,11 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
expand32(v[6], m[6]), expand32(v[7], m[7]) }; expand32(v[6], m[6]), expand32(v[7], m[7]) };
#if !defined(HAVE_AVX2) #if !defined(HAVE_AVX2)
m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]),
.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; .hi = set32x4(x[7], x[6], x[5], x[4]) };
#else #else
m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], m256 xvec = set32x8(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]); x[3], x[2], x[1], x[0]);
#endif #endif
return xvec; return xvec;
} }
@ -291,10 +291,10 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
expand64(v[2], m[2]), expand64(v[3], m[3]) }; expand64(v[2], m[2]), expand64(v[3], m[3]) };
#if !defined(HAVE_AVX2) #if !defined(HAVE_AVX2)
m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), m256 xvec = { .lo = set2x64(x[1], x[0]),
.hi = _mm_set_epi64x(x[3], x[2]) }; .hi = set2x64(x[3], x[2]) };
#else #else
m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); m256 xvec = set4x64(x[3], x[2], x[1], x[0]);
#endif #endif
return xvec; return xvec;
} }
@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[8], m[8]), expand32(v[9], m[9]),
expand32(v[10], m[10]), expand32(v[11], m[11]) }; expand32(v[10], m[10]), expand32(v[11], m[11]) };
m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]),
.mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), .mid = set32x4(x[7], x[6], x[5], x[4]),
.hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; .hi = set32x4(x[11], x[10], x[9], x[8]) };
return xvec; return xvec;
} }
#endif #endif
@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[2], m[2]), expand64(v[3], m[3]),
expand64(v[4], m[4]), expand64(v[5], m[5]) }; expand64(v[4], m[4]), expand64(v[5], m[5]) };
m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), m384 xvec = { .lo = set2x64(x[1], x[0]),
.mid = _mm_set_epi64x(x[3], x[2]), .mid = set2x64(x[3], x[2]),
.hi = _mm_set_epi64x(x[5], x[4]) }; .hi = set2x64(x[5], x[4]) };
return xvec; return xvec;
} }
#endif #endif
@ -548,20 +548,20 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
m512 xvec; m512 xvec;
#if defined(HAVE_AVX512) #if defined(HAVE_AVX512)
xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12], xvec = set32x16(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8], x[11], x[10], x[9], x[8],
x[7], x[6], x[5], x[4], x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]); x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2) #elif defined(HAVE_AVX2)
xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4], xvec.lo = set32x8(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]); x[3], x[2], x[1], x[0]);
xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], xvec.hi = set32x8(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8]); x[11], x[10], x[9], x[8]);
#else #else
xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]);
xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]);
xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]);
xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]);
#endif #endif
return xvec; return xvec;
} }
@ -588,16 +588,16 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
expand64(v[6], m[6]), expand64(v[7], m[7]) }; expand64(v[6], m[6]), expand64(v[7], m[7]) };
#if defined(HAVE_AVX512) #if defined(HAVE_AVX512)
m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4], m512 xvec = set64x8(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]); x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2) #elif defined(HAVE_AVX2)
m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]),
.hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; .hi = set4x64(x[7], x[6], x[5], x[4])};
#else #else
m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), m512 xvec = { .lo = { set2x64(x[1], x[0]),
_mm_set_epi64x(x[3], x[2]) }, set2x64(x[3], x[2]) },
.hi = { _mm_set_epi64x(x[5], x[4]), .hi = { set2x64(x[5], x[4]),
_mm_set_epi64x(x[7], x[6]) } }; set2x64(x[7], x[6]) } };
#endif #endif
return xvec; return xvec;
} }