Fix names; use our own intrinsics instead of explicit _mm* ones

This commit is contained in:
Konstantinos Margaritis
2020-09-23 11:51:21 +03:00
parent f7a6b8934c
commit 5333467249
15 changed files with 137 additions and 137 deletions

View File

@@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
if (len) {
m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime);
m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char));

View File

@@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
if (len) {
m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime);
m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char));
@@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
assert(s_in); /* should not already be dead */
assert(soft_c_end <= hard_c_end);
DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
m128 s = set16x8(s_in - 1);
m128 s = set1_16x8(s_in - 1);
const u8 *c = *c_inout;
const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
if (!do_accel) {
@@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
u32 sheng_limit_x4 = sheng_limit * 0x01010101;
m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4);
m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit);
DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit,
m->sheng_accel_limit, sheng_stop_limit);
#endif

View File

@@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
}
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
m128 cur_state = set16x8(*state);
m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;
while (likely(cur_buf != end)) {

View File

@@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
return MO_CONTINUE_MATCHING;
}
m128 cur_state = set16x8(*state);
m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;
while (likely(end - cur_buf >= 4)) {

View File

@@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
}
const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
@@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
}
const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
assert(buf_end - buf >= 16);
@@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
m128 mask2_lo, m128 mask2_hi,
const u8 *buf, const u8 *buf_end) {
const m128 ones = ones128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
@@ -455,15 +455,15 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end);
}
const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) {
return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}
const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;
@@ -579,15 +579,15 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end);
}
const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) {
return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}
const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;
assert(buf_end - buf >= 32);
@@ -676,7 +676,7 @@ static really_inline
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);
// run shufti over two overlapping 16-byte unaligned reads
const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
@@ -708,11 +708,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
}
const m256 ones = ones256();
const m256 low4bits = set32x8(0xf);
const m256 wide_mask1_lo = set2x128(mask1_lo);
const m256 wide_mask1_hi = set2x128(mask1_hi);
const m256 wide_mask2_lo = set2x128(mask2_lo);
const m256 wide_mask2_hi = set2x128(mask2_hi);
const m256 low4bits = set1_32x8(0xf);
const m256 wide_mask1_lo = set1_2x128(mask1_lo);
const m256 wide_mask1_hi = set1_2x128(mask1_hi);
const m256 wide_mask2_lo = set1_2x128(mask2_lo);
const m256 wide_mask2_hi = set1_2x128(mask2_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;

View File

@@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
m128 highconst = _mm_set1_epi8(0x80);
m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
m128 highconst = set1_16x8(0x80);
m128 shuf_mask_hi = set1_2x64(0x8040201008040201);
// and now do the real work
m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
@@ -260,8 +260,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
m256 highconst = _mm256_set1_epi8(0x80);
m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
m256 highconst = set1_32x8(0x80);
m256 shuf_mask_hi = set1_4x64(0x8040201008040201);
// and now do the real work
m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
@@ -315,8 +315,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("len %zu\n", buf_end - buf);
const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset);
const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end);
assert(buf < buf_end);
@@ -382,8 +382,8 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset);
const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end);
assert(buf < buf_end);
const u8 *rv;

View File

@@ -36,7 +36,7 @@
#define VERM_BOUNDARY 16
#define VERM_TYPE m128
#define VERM_SET_FN set16x8
#define VERM_SET_FN set1_16x8
static really_inline
const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
@@ -74,7 +74,7 @@ static really_inline
const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 31 < buf_end; buf += 32) {
m128 data = load128(buf);
@@ -122,7 +122,7 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found
static really_inline
const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
@@ -157,7 +157,7 @@ static really_inline
const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
@@ -219,7 +219,7 @@ const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline
const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
@@ -277,7 +277,7 @@ static really_inline
const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 15 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
@@ -309,7 +309,7 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found
static really_inline
const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
@@ -344,7 +344,7 @@ static really_inline
const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
@@ -381,7 +381,7 @@ const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline
const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
@@ -398,7 +398,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
#define VERM_BOUNDARY 64
#define VERM_TYPE m512
#define VERM_SET_FN set64x8
#define VERM_SET_FN set1_64x8
static really_inline
const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {