diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index aa9d1c1d..c79db037 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -131,7 +131,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft,
         u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
         tmp &= fdr->domainMask;
         s = *((const m128 *)ft + tmp);
-        s = shiftRight8Bits(s);
+        s = rshiftbyte_m128(s, 1);
     } else {
         s = fdr->start;
     }
@@ -185,20 +185,20 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
     m128 st14 = *(const m128 *)(ft + v14*8);
     m128 st15 = *(const m128 *)(ft + v15*8);

-    st1 = byteShiftLeft128(st1, 1);
-    st2 = byteShiftLeft128(st2, 2);
-    st3 = byteShiftLeft128(st3, 3);
-    st4 = byteShiftLeft128(st4, 4);
-    st5 = byteShiftLeft128(st5, 5);
-    st6 = byteShiftLeft128(st6, 6);
-    st7 = byteShiftLeft128(st7, 7);
-    st9 = byteShiftLeft128(st9, 1);
-    st10 = byteShiftLeft128(st10, 2);
-    st11 = byteShiftLeft128(st11, 3);
-    st12 = byteShiftLeft128(st12, 4);
-    st13 = byteShiftLeft128(st13, 5);
-    st14 = byteShiftLeft128(st14, 6);
-    st15 = byteShiftLeft128(st15, 7);
+    st1 = lshiftbyte_m128(st1, 1);
+    st2 = lshiftbyte_m128(st2, 2);
+    st3 = lshiftbyte_m128(st3, 3);
+    st4 = lshiftbyte_m128(st4, 4);
+    st5 = lshiftbyte_m128(st5, 5);
+    st6 = lshiftbyte_m128(st6, 6);
+    st7 = lshiftbyte_m128(st7, 7);
+    st9 = lshiftbyte_m128(st9, 1);
+    st10 = lshiftbyte_m128(st10, 2);
+    st11 = lshiftbyte_m128(st11, 3);
+    st12 = lshiftbyte_m128(st12, 4);
+    st13 = lshiftbyte_m128(st13, 5);
+    st14 = lshiftbyte_m128(st14, 6);
+    st15 = lshiftbyte_m128(st15, 7);

     *s = or128(*s, st0);
     *s = or128(*s, st1);
@@ -209,7 +209,7 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
     *s = or128(*s, st6);
     *s = or128(*s, st7);
     *conf0 = movq(*s);
-    *s = byteShiftRight128(*s, 8);
+    *s = rshiftbyte_m128(*s, 8);
     *conf0 ^= ~0ULL;

     *s = or128(*s, st8);
@@ -221,7 +221,7 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
     *s = or128(*s, st14);
     *s = or128(*s, st15);
     *conf8 = movq(*s);
-    *s = byteShiftRight128(*s, 8);
+    *s = rshiftbyte_m128(*s, 8);
     *conf8 ^= ~0ULL;
 }
@@ -252,19 +252,19 @@ void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
     m128 st12 = *(const m128 *)(ft + v12*8);
     m128 st14 = *(const m128 *)(ft + v14*8);

-    st2 = byteShiftLeft128(st2, 2);
-    st4 = byteShiftLeft128(st4, 4);
-    st6 = byteShiftLeft128(st6, 6);
-    st10 = byteShiftLeft128(st10, 2);
-    st12 = byteShiftLeft128(st12, 4);
-    st14 = byteShiftLeft128(st14, 6);
+    st2 = lshiftbyte_m128(st2, 2);
+    st4 = lshiftbyte_m128(st4, 4);
+    st6 = lshiftbyte_m128(st6, 6);
+    st10 = lshiftbyte_m128(st10, 2);
+    st12 = lshiftbyte_m128(st12, 4);
+    st14 = lshiftbyte_m128(st14, 6);

     *s = or128(*s, st0);
     *s = or128(*s, st2);
     *s = or128(*s, st4);
     *s = or128(*s, st6);
     *conf0 = movq(*s);
-    *s = byteShiftRight128(*s, 8);
+    *s = rshiftbyte_m128(*s, 8);
     *conf0 ^= ~0ULL;

     *s = or128(*s, st8);
@@ -272,7 +272,7 @@ void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
     *s = or128(*s, st12);
     *s = or128(*s, st14);
     *conf8 = movq(*s);
-    *s = byteShiftRight128(*s, 8);
+    *s = rshiftbyte_m128(*s, 8);
     *conf8 ^= ~0ULL;
 }
@@ -295,19 +295,19 @@ void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
     m128 st8 = *(const m128 *)(ft + v8*8);
     m128 st12 = *(const m128 *)(ft + v12*8);

-    st4 = byteShiftLeft128(st4, 4);
-    st12 = byteShiftLeft128(st12, 4);
+    st4 = lshiftbyte_m128(st4, 4);
+    st12 = lshiftbyte_m128(st12, 4);

     *s = or128(*s, st0);
     *s = or128(*s, st4);
     *conf0 = movq(*s);
-    *s = byteShiftRight128(*s, 8);
+    *s = rshiftbyte_m128(*s, 8);
     *conf0 ^= ~0ULL;

     *s = or128(*s, st8);
     *s = or128(*s, st12);
     *conf8 = movq(*s);
-    *s = byteShiftRight128(*s, 8);
+    *s = rshiftbyte_m128(*s, 8);
     *conf8 ^= ~0ULL;
 }
diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c
index 4ff0b18e..2406a167 100644
--- a/src/fdr/teddy.c
+++ b/src/fdr/teddy.c
@@ -79,7 +79,7 @@ const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
 do { \
     if (unlikely(isnonzero128(var))) { \
         u64a lo = movq(var); \
-        u64a hi = movq(byteShiftRight128(var, 8)); \
+        u64a hi = movq(rshiftbyte_m128(var, 8)); \
         if (unlikely(lo)) { \
             conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \
                     control, &last_match); \
@@ -97,9 +97,9 @@ do { \
 do { \
     if (unlikely(isnonzero128(var))) { \
         u32 part1 = movd(var); \
-        u32 part2 = movd(byteShiftRight128(var, 4)); \
-        u32 part3 = movd(byteShiftRight128(var, 8)); \
-        u32 part4 = movd(byteShiftRight128(var, 12)); \
+        u32 part2 = movd(rshiftbyte_m128(var, 4)); \
+        u32 part3 = movd(rshiftbyte_m128(var, 8)); \
+        u32 part4 = movd(rshiftbyte_m128(var, 12)); \
         if (unlikely(part1)) { \
             conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
                     control, &last_match); \
@@ -128,7 +128,7 @@ static really_inline
 m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
     m128 mask = set16x8(0xf);
     m128 lo = and128(val, mask);
-    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 hi = and128(rshift64_m128(val, 4), mask);
     return and128(and128(pshufb(maskBase[0*2], lo),
                          pshufb(maskBase[0*2+1], hi)), p_mask);
 }
@@ -138,7 +138,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask,
                         m128 val) {
     m128 mask = set16x8(0xf);
     m128 lo = and128(val, mask);
-    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m1(maskBase, p_mask, val);

     m128 res_1 = and128(pshufb(maskBase[1*2], lo),
@@ -153,7 +153,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
                         m128 p_mask, m128 val) {
     m128 mask = set16x8(0xf);
     m128 lo = and128(val, mask);
-    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val);

     m128 res_2 = and128(pshufb(maskBase[2*2], lo),
@@ -168,7 +168,7 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
                         m128 *old_3, m128 p_mask, m128 val) {
     m128 mask = set16x8(0xf);
     m128 lo = and128(val, mask);
-    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val);

     m128 res_3 = and128(pshufb(maskBase[3*2], lo),
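[Editorial note, not part of the patch] The old names conflated two different operations: byteShiftRight128/shiftRight8Bits shift the whole 128-bit value by bytes (PSRLDQ), while rshift2x64 shifts bits within each 64-bit lane (PSRLQ), and "shiftRight8Bits" was easy to misread as a bit shift. The new scheme encodes the operation, the granularity, and the vector type in the name. A minimal standalone sketch of the difference, assuming SSE2 and using raw intrinsics rather than Hyperscan's m128 wrappers:

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
        /* byte 0 holds 0, byte 1 holds 1, ... byte 15 holds 15 */
        __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                 7, 6, 5, 4, 3, 2, 1, 0);

        /* rshiftbyte_m128(v, 1) in the patch's terms: the whole 128-bit
         * value moves right by one byte. */
        __m128i by_byte = _mm_srli_si128(v, 1);

        /* rshift64_m128(v, 4) in the patch's terms: each 64-bit lane moves
         * right by four bits; no bits cross the lane boundary. */
        __m128i by_bits = _mm_srli_epi64(v, 4);

        unsigned char a[16], b[16];
        _mm_storeu_si128((__m128i *)a, by_byte);
        _mm_storeu_si128((__m128i *)b, by_bits);
        printf("byte shift: %d, bit shift: %d\n", a[0], b[0]); /* 1, 16 */
        return 0;
    }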
diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c
index ef06813c..5ea4e368 100644
--- a/src/fdr/teddy_avx2.c
+++ b/src/fdr/teddy_avx2.c
@@ -371,7 +371,7 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) {
                               64 * (offset);
             *arrCnt += 1;
         }
-        u64a part_1 = movq(byteShiftRight128(var, 8));
+        u64a part_1 = movq(rshiftbyte_m128(var, 8));
         while (unlikely(part_1)) {
             bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
                               64 * (offset + 1);
             *arrCnt += 1;
         }
@@ -384,19 +384,19 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) {
                               32 * (offset * 2);
             *arrCnt += 1;
         }
-        u32 part_1 = movd(byteShiftRight128(var, 4));
+        u32 part_1 = movd(rshiftbyte_m128(var, 4));
         while (unlikely(part_1)) {
             bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
                               32 * (offset * 2 + 1);
             *arrCnt += 1;
         }
-        u32 part_2 = movd(byteShiftRight128(var, 8));
+        u32 part_2 = movd(rshiftbyte_m128(var, 8));
         while (unlikely(part_2)) {
             bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) +
                               32 * (offset * 2 + 2);
             *arrCnt += 1;
         }
-        u32 part_3 = movd(byteShiftRight128(var, 12));
+        u32 part_3 = movd(rshiftbyte_m128(var, 12));
         while (unlikely(part_3)) {
             bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) +
                               32 * (offset * 2 + 3);
@@ -410,7 +410,7 @@ static really_inline
 m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) {
     m256 mask = set32x8(0xf);
     m256 lo = and256(val, mask);
-    m256 hi = and256(rshift4x64(val, 4), mask);
+    m256 hi = and256(rshift64_m256(val, 4), mask);
     return and256(and256(vpshufb(maskBase[0*2], lo),
                          vpshufb(maskBase[0*2+1], hi)), p_mask);
 }
@@ -420,7 +420,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask,
                             m256 val) {
     m256 mask = set32x8(0xf);
     m256 lo = and256(val, mask);
-    m256 hi = and256(rshift4x64(val, 4), mask);
+    m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val);

     m256 res_1 = and256(vpshufb(maskBase[1*2], lo),
@@ -435,7 +435,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
                             m256 p_mask, m256 val) {
     m256 mask = set32x8(0xf);
     m256 lo = and256(val, mask);
-    m256 hi = and256(rshift4x64(val, 4), mask);
+    m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val);

     m256 res_2 = and256(vpshufb(maskBase[2*2], lo),
@@ -450,7 +450,7 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
                             m256 *old_3, m256 p_mask, m256 val) {
     m256 mask = set32x8(0xf);
     m256 lo = and256(val, mask);
-    m256 hi = and256(rshift4x64(val, 4), mask);
+    m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val);

     m256 res_3 = and256(vpshufb(maskBase[3*2], lo),
@@ -464,7 +464,7 @@ static really_inline
 m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi,
                              m256 p_mask) {
     m256 lo = and256(val, mask);
-    m256 hi = and256(rshift4x64(val, 4), mask);
+    m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi));
     return and256(res, p_mask);
 }
diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c
index b3673246..40575409 100644
--- a/src/hwlm/noodle_engine_sse.c
+++ b/src/hwlm/noodle_engine_sse.c
@@ -115,7 +115,8 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key,
         v = and128(v, caseMask);
     }

-    u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v)));
+    u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
+                               eq128(mask2, v)));

     // mask out where we can't match
     u32 mask = (0xFFFF >> (16 - l));
@@ -142,7 +143,8 @@ hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset,
         v = and128(v, caseMask);
     }

-    u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v)));
+    u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
+                               eq128(mask2, v)));

     // mask out where we can't match
     u32 buf_off = start - offset;
diff --git a/src/nfa/limex_runtime.h b/src/nfa/limex_runtime.h
index 70601e27..e0c182fc 100644
--- a/src/nfa/limex_runtime.h
+++ b/src/nfa/limex_runtime.h
@@ -75,7 +75,7 @@ struct proto_cache {
 // Shift macros for Limited NFAs. Defined in terms of uniform ops.
 // LimExNFAxxx ptr in 'limex' and the current state in 's'
 #define NFA_EXEC_LIM_SHIFT(nels_type, nels_i) \
-    (JOIN(shift_, nels_type)( \
+    (JOIN(lshift_, nels_type)( \
         JOIN(and_, nels_type)(s, \
                               JOIN(load_, nels_type)(&limex->shift[nels_i])), \
         limex->shiftAmount[nels_i]))
diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c
index 5aba9847..903e04da 100644
--- a/src/nfa/shufti.c
+++ b/src/nfa/shufti.c
@@ -40,7 +40,6 @@

 #include "shufti_common.h"

-
 /** \brief Naive byte-by-byte implementation. */
 static really_inline
 const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
@@ -234,7 +233,7 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,

     m128 c2_lo = pshufb(mask2_lo, chars_lo);
     m128 c2_hi = pshufb(mask2_hi, chars_hi);
-    m128 t2 = or128(t, shiftRight8Bits(or128(c2_lo, c2_hi)));
+    m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));

 #ifdef DEBUG
     DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
@@ -471,7 +470,7 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,

     m256 c2_lo = vpshufb(mask2_lo, chars_lo);
     m256 c2_hi = vpshufb(mask2_hi, chars_hi);
-    m256 t2 = or256(t, shift256Right8Bits(or256(c2_lo, c2_hi)));
+    m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));

 #ifdef DEBUG
     DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
diff --git a/src/nfa/shufti_common.h b/src/nfa/shufti_common.h
index 84835665..e63ad27a 100644
--- a/src/nfa/shufti_common.h
+++ b/src/nfa/shufti_common.h
@@ -93,7 +93,7 @@ DUMP_MSK(128)
 #endif

 #define GET_LO_4(chars) and128(chars, low4bits)
-#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)

 static really_inline
 u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
@@ -119,7 +119,7 @@ DUMP_MSK(256)
 #endif

 #define GET_LO_4(chars) and256(chars, low4bits)
-#define GET_HI_4(chars) rshift4x64(andnot256(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)

 static really_inline
 u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
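[Editorial note, not part of the patch] The and128/rshift64_m128(val, 4) pair that recurs in Teddy, shufti, and the GET_HI_4 macros splits every byte into its two nibbles so PSHUFB can use each half as an index into a 16-entry table. SSE has no per-byte shift, but shifting each 64-bit lane right by four bits and masking with 0xf gives the same per-byte result. A sketch, assuming SSE2 (names are illustrative):

    #include <emmintrin.h>

    /* Split each byte of 'val' into its low and high 4-bit halves. */
    static inline void split_nibbles(__m128i val, __m128i *lo, __m128i *hi) {
        const __m128i mask = _mm_set1_epi8(0xf);
        *lo = _mm_and_si128(val, mask);
        /* rshift64_m128(val, 4) in the patch's terms: the bits pulled in
         * from the neighbouring byte are removed by the mask. */
        *hi = _mm_and_si128(_mm_srli_epi64(val, 4), mask);
    }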
diff --git a/src/nfa/truffle_common.h b/src/nfa/truffle_common.h
index 593a605e..7368e550 100644
--- a/src/nfa/truffle_common.h
+++ b/src/nfa/truffle_common.h
@@ -48,7 +48,6 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
     return NULL; // no match
 }

-#define shift128r(a, b) _mm_srli_epi64((a), (b))
 static really_inline
 u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
@@ -59,7 +58,7 @@ u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
     m128 shuf1 = pshufb(shuf_mask_lo_highclear, v);
     m128 t1 = xor128(v, highconst);
     m128 shuf2 = pshufb(shuf_mask_lo_highset, t1);
-    m128 t2 = andnot128(highconst, shift128r(v, 4));
+    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
     m128 shuf3 = pshufb(shuf_mask_hi, t2);
     m128 tmp = and128(or128(shuf1, shuf2), shuf3);
     m128 tmp2 = eq128(tmp, zeroes128());
@@ -102,7 +101,6 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
     return NULL; // no match
 }

-#define shift256r(a, b) _mm256_srli_epi64((a), (b))
 static really_inline
 u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
@@ -113,7 +111,7 @@ u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
     m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v);
     m256 t1 = xor256(v, highconst);
     m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1);
-    m256 t2 = andnot256(highconst, shift256r(v, 4));
+    m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
     m256 shuf3 = vpshufb(shuf_mask_hi, t2);
     m256 tmp = and256(or256(shuf1, shuf2), shuf3);
     m256 tmp2 = eq256(tmp, zeroes256());
diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h
index 1883a44c..0749470f 100644
--- a/src/nfa/vermicelli_sse.h
+++ b/src/nfa/vermicelli_sse.h
@@ -138,7 +138,7 @@ const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
     for (; buf + 16 < buf_end; buf += 16) {
         m128 data = load128(buf);
         u32 z = movemask128(and128(eq128(chars1, data),
-                                   shiftRight8Bits(eq128(chars2, data))));
+                                   rshiftbyte_m128(eq128(chars2, data), 1)));
         if (buf[15] == c1 && buf[16] == c2) {
             z |= (1 << 15);
         }
@@ -161,7 +161,7 @@ const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
         m128 data = load128(buf);
         m128 v = and128(casemask, data);
         u32 z = movemask128(and128(eq128(chars1, v),
-                                   shiftRight8Bits(eq128(chars2, v))));
+                                   rshiftbyte_m128(eq128(chars2, v), 1)));
         if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
             z |= (1 << 15);
         }
@@ -182,8 +182,10 @@ const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2,
     for (; buf + 16 < buf_end; buf += 16) {
         m128 data = load128(buf);
-        u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)),
-                   shiftRight8Bits(eq128(chars2, and128(data, mask2)))));
+        m128 v1 = eq128(chars1, and128(data, mask1));
+        m128 v2 = eq128(chars2, and128(data, mask2));
+        u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
+
         if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
             z |= (1 << 15);
         }
@@ -201,7 +203,7 @@ static really_inline
 const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
     m128 data = loadu128(buf); // unaligned
     u32 z = movemask128(and128(eq128(chars1, data),
-                               shiftRight8Bits(eq128(chars2, data))));
+                               rshiftbyte_m128(eq128(chars2, data), 1)));

     /* no fixup of the boundary required - the aligned run will pick it up */
     if (unlikely(z)) {
@@ -219,7 +221,7 @@ const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
     m128 data = loadu128(buf); // unaligned
     m128 v = and128(casemask, data);
     u32 z = movemask128(and128(eq128(chars1, v),
-                               shiftRight8Bits(eq128(chars2, v))));
+                               rshiftbyte_m128(eq128(chars2, v), 1)));

     /* no fixup of the boundary required - the aligned run will pick it up */
     if (unlikely(z)) {
@@ -234,8 +236,9 @@ static really_inline
 const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
                                   m128 mask1, m128 mask2, const u8 *buf) {
     m128 data = loadu128(buf); // unaligned
-    u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)),
-                   shiftRight8Bits(eq128(chars2, and128(data, mask2)))));
+    m128 v1 = eq128(chars1, and128(data, mask1));
+    m128 v2 = eq128(chars2, and128(data, mask2));
+    u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));

     /* no fixup of the boundary required - the aligned run will pick it up */
     if (unlikely(z)) {
@@ -324,7 +327,7 @@ const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
     for (; buf + 16 < buf_end; buf_end -= 16) {
         m128 data = load128(buf_end - 16);
         u32 z = movemask128(and128(eq128(chars2, data),
-                                   shiftLeft8Bits(eq128(chars1, data))));
+                                   lshiftbyte_m128(eq128(chars1, data), 1)));
         if (buf_end[-17] == c1 && buf_end[-16] == c2) {
             z |= 1;
         }
@@ -345,7 +348,7 @@ const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
         m128 data = load128(buf_end - 16);
         m128 v = and128(casemask, data);
         u32 z = movemask128(and128(eq128(chars2, v),
-                                   shiftLeft8Bits(eq128(chars1, v))));
+                                   lshiftbyte_m128(eq128(chars1, v), 1)));
         if ((buf_end[-17] & CASE_CLEAR) == c1 &&
             (buf_end[-16] & CASE_CLEAR) == c2) {
             z |= 1;
@@ -362,7 +365,7 @@ static really_inline
 const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
     m128 data = loadu128(buf);
     u32 z = movemask128(and128(eq128(chars2, data),
-                               shiftLeft8Bits(eq128(chars1, data))));
+                               lshiftbyte_m128(eq128(chars1, data), 1)));

     /* no fixup of the boundary required - the aligned run will pick it up */
     if (unlikely(z)) {
@@ -380,7 +383,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
     m128 data = loadu128(buf);
     m128 v = and128(casemask, data);
     u32 z = movemask128(and128(eq128(chars2, v),
-                               shiftLeft8Bits(eq128(chars1, v))));
+                               lshiftbyte_m128(eq128(chars1, v), 1)));

     /* no fixup of the boundary required - the aligned run will pick it up */
     if (unlikely(z)) {
         return lastMatchOffset(buf + 16, z);
diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h
index cd84d052..76db5a77 100644
--- a/src/rose/counting_miracle.h
+++ b/src/rose/counting_miracle.h
@@ -82,7 +82,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end,
 }

 #define GET_LO_4(chars) and128(chars, low4bits)
-#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)

 static really_inline
 u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
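[Editorial note, not part of the patch] Every vermicelli change above is the same idiom: a pair (c1, c2) matches at offset i when byte i equals c1 and byte i+1 equals c2, so the second comparison is shifted by one byte before the AND (right for the forward scan, left for the reverse scan). A sketch of the forward case, assuming SSE2:

    #include <emmintrin.h>

    /* Bit i of the result is set when data[i] == c1 and data[i+1] == c2;
     * the final byte needs the caller's boundary fixup, as in the code
     * above. */
    static inline unsigned match_pair_mask(__m128i data, char c1, char c2) {
        __m128i e1 = _mm_cmpeq_epi8(data, _mm_set1_epi8(c1));
        __m128i e2 = _mm_cmpeq_epi8(data, _mm_set1_epi8(c2));
        /* rshiftbyte_m128(e2, 1) in the patch's terms. */
        return (unsigned)_mm_movemask_epi8(
            _mm_and_si128(e1, _mm_srli_si128(e2, 1)));
    }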
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index d3dba9a3..5f557ba5 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -149,8 +149,8 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
 #endif
 }

-#define shift2x64(a, b) _mm_slli_epi64((a), (b))
-#define rshift2x64(a, b) _mm_srli_epi64((a), (b))
+#define lshift64_m128(a, b) _mm_slli_epi64((a), (b))
+#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))

 #define eq128(a, b) _mm_cmpeq_epi8((a), (b))
 #define movemask128(a) ((u32)_mm_movemask_epi8((a)))
@@ -172,16 +172,8 @@ static really_inline u64a movq(const m128 in) {
 #endif
 }

-static really_inline m128 shiftRight8Bits(m128 a) {
-    return _mm_srli_si128(a,1);
-}
-
-static really_inline m128 shiftLeft8Bits(m128 a) {
-    return _mm_slli_si128(a,1);
-}
-
-#define byteShiftRight128(a, count_immed) _mm_srli_si128(a, count_immed)
-#define byteShiftLeft128(a, count_immed) _mm_slli_si128(a, count_immed)
+#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
+#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)

 #if !defined(__AVX2__)
 // TODO: this entire file needs restructuring - this carveout is awful
@@ -191,8 +183,8 @@ static really_inline m128 shiftLeft8Bits(m128 a) {
 #define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
 #define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2)
 #else
-#define extract32from256(a, imm) movd(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8))
-#define extract64from256(a, imm) movq(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8))
+#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8))
+#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8))
 #endif
 #endif // !AVX2
@@ -213,10 +205,6 @@ static really_inline m128 andnot128(m128 a, m128 b) {
     return _mm_andnot_si128(a, b);
 }

-// The shift amount is an immediate, so we define these operations as macros on
-// Intel SIMD.
-#define shift128(a, b) _mm_slli_epi64((a), (b))
-
 // aligned load
 static really_inline m128 load128(const void *ptr) {
     assert(ISALIGNED_N(ptr, alignof(m128)));
@@ -335,8 +323,8 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
 ****/

 #if defined(__AVX2__)
-#define shift4x64(a, b) _mm256_slli_epi64((a), (b))
-#define rshift4x64(a, b) _mm256_srli_epi64((a), (b))
+#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
+#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))

 static really_inline
 m256 set32x8(u32 in) {
@@ -354,18 +342,18 @@ m256 set2x128(m128 a) {

 #else
 static really_inline
-m256 shift4x64(m256 a, int b) {
+m256 lshift64_m256(m256 a, int b) {
     m256 rv = a;
-    rv.lo = shift2x64(rv.lo, b);
-    rv.hi = shift2x64(rv.hi, b);
+    rv.lo = lshift64_m128(rv.lo, b);
+    rv.hi = lshift64_m128(rv.hi, b);
     return rv;
 }
 static really_inline
-m256 rshift4x64(m256 a, int b) {
+m256 rshift64_m256(m256 a, int b) {
     m256 rv = a;
-    rv.lo = rshift2x64(rv.lo, b);
-    rv.hi = rshift2x64(rv.hi, b);
+    rv.lo = rshift64_m128(rv.lo, b);
+    rv.hi = rshift64_m128(rv.hi, b);
     return rv;
 }
 static really_inline
@@ -461,18 +449,6 @@ static really_inline m256 andnot256(m256 a, m256 b) {
 }
 #endif

-// The shift amount is an immediate
-#if defined(__AVX2__)
-#define shift256(a, b) _mm256_slli_epi64((a), (b))
-#else
-static really_really_inline m256 shift256(m256 a, unsigned b) {
-    m256 rv;
-    rv.lo = shift128(a.lo, b);
-    rv.hi = shift128(a.hi, b);
-    return rv;
-}
-#endif
-
 static really_inline int diff256(m256 a, m256 b) {
 #if defined(__AVX2__)
     return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
@@ -673,21 +649,12 @@ m128 movdq_lo(m256 x) {
     return _mm256_extracti128_si256(x, 0);
 }

-static really_inline
-m256 shift256Right8Bits(m256 a) {
-    return _mm256_srli_si256(a, 1);
-}
-
-static really_inline
-m256 shift256Left8Bits(m256 a) {
-    return _mm256_slli_si256(a, 1);
-}
 #define cast256to128(a) _mm256_castsi256_si128(a)
 #define cast128to256(a) _mm256_castsi128_si256(a)
 #define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
 #define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
-#define byteShiftRight256(a, count_immed) _mm256_srli_si256(a, count_immed)
-#define byteShiftLeft256(a, count_immed) _mm256_slli_si256(a, count_immed)
+#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
+#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
 #define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
 #define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
 #define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
@@ -741,11 +708,12 @@ static really_inline m384 andnot384(m384 a, m384 b) {
 }

 // The shift amount is an immediate
-static really_really_inline m384 shift384(m384 a, unsigned b) {
+static really_really_inline
+m384 lshift64_m384(m384 a, unsigned b) {
     m384 rv;
-    rv.lo = shift128(a.lo, b);
-    rv.mid = shift128(a.mid, b);
-    rv.hi = shift128(a.hi, b);
+    rv.lo = lshift64_m128(a.lo, b);
+    rv.mid = lshift64_m128(a.mid, b);
+    rv.hi = lshift64_m128(a.hi, b);
     return rv;
 }
@@ -913,10 +881,11 @@ static really_inline m512 andnot512(m512 a, m512 b) {
 }

 // The shift amount is an immediate
-static really_really_inline m512 shift512(m512 a, unsigned b) {
+static really_really_inline
+m512 lshift64_m512(m512 a, unsigned b) {
     m512 rv;
-    rv.lo = shift256(a.lo, b);
-    rv.hi = shift256(a.hi, b);
+    rv.lo = lshift64_m256(a.lo, b);
+    rv.hi = lshift64_m256(a.hi, b);
     return rv;
 }
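[Editorial note, not part of the patch] Read from the simd_utils.h hunks above, the new scheme is op + element width + vector type, which makes the granularity of every shift explicit:

    /* Editorial summary of the renames in this patch:
     *   lshift64_m128(a, b)    shift each 64-bit lane of an m128 left b bits
     *                          (was shift2x64 / shift128)
     *   rshift64_m256(a, b)    shift each 64-bit lane of an m256 right b bits
     *                          (was rshift4x64)
     *   lshiftbyte_m128(a, n)  shift the whole m128 left by n bytes
     *                          (was byteShiftLeft128 / shiftLeft8Bits)
     *   rshift128_m256(a, n)   shift each 128-bit lane of an m256 right by
     *                          n bytes (was byteShiftRight256; VPSRLDQ works
     *                          per 128-bit lane, hence the "128" in the name)
     *   lshift64_m384/_m512    per-64-bit-lane left shift on the wider
     *                          types, emulated lane by lane
     *                          (was shift384 / shift512)
     */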
diff --git a/src/util/uniform_ops.h b/src/util/uniform_ops.h
index 45ea4108..0619c7e4 100644
--- a/src/util/uniform_ops.h
+++ b/src/util/uniform_ops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -125,12 +125,12 @@
 #define andnot_m384(a, b) (andnot384(a, b))
 #define andnot_m512(a, b) (andnot512(a, b))

-#define shift_u32(a, b) ((a) << (b))
-#define shift_u64a(a, b) ((a) << (b))
-#define shift_m128(a, b) (shift128(a, b))
-#define shift_m256(a, b) (shift256(a, b))
-#define shift_m384(a, b) (shift384(a, b))
-#define shift_m512(a, b) (shift512(a, b))
+#define lshift_u32(a, b) ((a) << (b))
+#define lshift_u64a(a, b) ((a) << (b))
+#define lshift_m128(a, b) (lshift64_m128(a, b))
+#define lshift_m256(a, b) (lshift64_m256(a, b))
+#define lshift_m384(a, b) (lshift64_m384(a, b))
+#define lshift_m512(a, b) (lshift64_m512(a, b))

 #define isZero_u8(a) ((a) == 0)
 #define isZero_u32(a) ((a) == 0)
diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index e95f7533..3c07b2b0 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -643,50 +643,50 @@ TEST(SimdUtilsTest, variableByteShift128) {
     char base[] = "0123456789ABCDEF";
     m128 in = loadu128(base);

-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 0),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
                          variable_byte_shift_m128(in, 0)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 1),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
                          variable_byte_shift_m128(in, -1)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 2),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2),
                          variable_byte_shift_m128(in, -2)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 3),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3),
                          variable_byte_shift_m128(in, -3)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 4),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4),
                          variable_byte_shift_m128(in, -4)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 5),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5),
                          variable_byte_shift_m128(in, -5)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 6),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6),
                          variable_byte_shift_m128(in, -6)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 7),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7),
                          variable_byte_shift_m128(in, -7)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 8),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8),
                          variable_byte_shift_m128(in, -8)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 9),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9),
                          variable_byte_shift_m128(in, -9)));
-    EXPECT_TRUE(!diff128(byteShiftRight128(in, 10),
+    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10),
                          variable_byte_shift_m128(in, -10)));

-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 0),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0),
                          variable_byte_shift_m128(in, 0)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 1),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1),
                          variable_byte_shift_m128(in, 1)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 2),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2),
                          variable_byte_shift_m128(in, 2)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 3),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3),
                          variable_byte_shift_m128(in, 3)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 4),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4),
                          variable_byte_shift_m128(in, 4)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 5),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5),
                          variable_byte_shift_m128(in, 5)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 6),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6),
                          variable_byte_shift_m128(in, 6)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 7),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7),
                          variable_byte_shift_m128(in, 7)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 8),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8),
                          variable_byte_shift_m128(in, 8)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 9),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9),
                          variable_byte_shift_m128(in, 9)));
-    EXPECT_TRUE(!diff128(byteShiftLeft128(in, 10),
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
                          variable_byte_shift_m128(in, 10)));

     EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16)));
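[Editorial note, not part of the patch] The updated test pins variable_byte_shift_m128 to the renamed immediate forms: positive amounts must behave like lshiftbyte_m128, negative amounts like rshiftbyte_m128, and an amount of 16 must give all zeroes. A scalar model of that contract, as read from the assertions above (illustrative only):

    #include <string.h>

    /* out[i] = in[i - amount] where that index is in range, else 0. */
    static void byte_shift_model(const unsigned char in[16], int amount,
                                 unsigned char out[16]) {
        memset(out, 0, 16);
        for (int i = 0; i < 16; i++) {
            int src = i - amount; /* positive amount = shift left, i.e.
                                   * toward higher byte indices */
            if (src >= 0 && src < 16) {
                out[i] = in[src];
            }
        }
    }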