rename vpshufb to pshufb_m256
commit a295c96198
parent eabe408e2b
@@ -147,8 +147,8 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
     m256 mask = set32x8(0xf);
     m256 lo = and256(val, mask);
     m256 hi = and256(rshift64_m256(val, 4), mask);
-    return and256(vpshufb(maskBase[0*2], lo),
-                  vpshufb(maskBase[0*2+1], hi));
+    return and256(pshufb_m256(maskBase[0*2], lo),
+                  pshufb_m256(maskBase[0*2+1], hi));
 }
 
 static really_inline
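The two pshufb_m256 lookups above implement Teddy's nibble-table membership test. A minimal scalar sketch of what they compute per byte; lo_table and hi_table are illustrative stand-ins for maskBase[0*2] and maskBase[0*2+1], not names from the diff:

/* A byte can belong to bucket b only if bit b survives both the
 * low-nibble and the high-nibble table lookup. */
static unsigned char bucket_bits(const unsigned char lo_table[16],
                                 const unsigned char hi_table[16],
                                 unsigned char byte) {
    return (unsigned char)(lo_table[byte & 0xf] & hi_table[byte >> 4]);
}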
@@ -158,8 +158,8 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m1(maskBase, val);
 
-    m256 res_1 = and256(vpshufb(maskBase[1*2], lo),
-                        vpshufb(maskBase[1*2+1], hi));
+    m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo),
+                        pshufb_m256(maskBase[1*2+1], hi));
     m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1);
     *old_1 = res_1;
     return and256(r, res_shifted_1);
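vpalignr(res_1, *old_1, 16-1) concatenates the previous iteration's results with the current ones and shifts by 15 bytes within each 128-bit lane, so the second-mask hits line up with the first-mask hits one byte later. A sketch of the per-lane operation using the raw SSSE3 intrinsic, as an illustration rather than the project's vpalignr wrapper itself:

#include <tmmintrin.h>

/* Per 128-bit lane: result byte 0 is prev byte 15, result byte i is
 * cur byte i - 1, i.e. cur's results delayed by one byte position. */
static __m128i shift_in_prev_byte(__m128i cur, __m128i prev) {
    return _mm_alignr_epi8(cur, prev, 16 - 1);
}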
@@ -173,8 +173,8 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
 
-    m256 res_2 = and256(vpshufb(maskBase[2*2], lo),
-                        vpshufb(maskBase[2*2+1], hi));
+    m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo),
+                        pshufb_m256(maskBase[2*2+1], hi));
     m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2);
     *old_2 = res_2;
     return and256(r, res_shifted_2);
@@ -188,8 +188,8 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
 
-    m256 res_3 = and256(vpshufb(maskBase[3*2], lo),
-                        vpshufb(maskBase[3*2+1], hi));
+    m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo),
+                        pshufb_m256(maskBase[3*2+1], hi));
     m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3);
     *old_3 = res_3;
     return and256(r, res_shifted_3);
@@ -54,7 +54,7 @@ u32 packedExtract128(m128 s, const m128 permute, const m128 compare) {
 static really_inline
 u32 packedExtract256(m256 s, const m256 permute, const m256 compare) {
     // vpshufb doesn't cross lanes, so this is a bit of a cheat
-    m256 shuffled = vpshufb(s, permute);
+    m256 shuffled = pshufb_m256(s, permute);
     m256 compared = and256(shuffled, compare);
     u32 rv = ~movemask256(eq256(compared, shuffled));
     // stitch the lane-wise results back together
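The "bit of a cheat" comment refers to _mm256_shuffle_epi8 operating on each 128-bit lane independently: an index can only select a byte from its own lane, which is why the lane-wise results must be stitched back together afterwards. A small illustration assuming AVX2; the function name is mine:

#include <immintrin.h>

/* Each index selects a byte from its own 128-bit lane only; an index
 * with its top bit set yields zero. No byte ever crosses the lane
 * boundary, unlike a true 32-byte permute. */
static __m256i lane_local_shuffle(__m256i v, __m256i idx) {
    return _mm256_shuffle_epi8(v, idx);
}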
@@ -373,8 +373,8 @@ DUMP_MSK(256)
 static really_inline
 u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
           const m256 compare) {
-    m256 c_lo = vpshufb(mask_lo, GET_LO_4(chars));
-    m256 c_hi = vpshufb(mask_hi, GET_HI_4(chars));
+    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
+    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
     m256 t = and256(c_lo, c_hi);
 
 #ifdef DEBUG
@@ -407,7 +407,7 @@ const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf,
     // do the hi and lo shuffles in the one avx register
     m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
-    m256 c_shuf = vpshufb(mask, c);
+    m256 c_shuf = pshufb_m256(mask, c);
     m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
     // the upper 32-bits can't match
     u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));
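fwdBlockShort packs the high nibbles into one lane and the low nibbles into the other, so a single 256-bit shuffle performs both table lookups; and-ing the two halves afterwards gives the usual shufti test. A sketch in raw AVX2 intrinsics, under my assumption that combine2x128(hi, lo) places hi in the upper lane:

#include <immintrin.h>

/* One 256-bit shuffle does both nibble lookups: the lower lane indexes
 * with the low nibbles, the upper lane with the high nibbles. */
static __m256i shuffle_both_nibbles(__m256i nib_tables, __m128i chars) {
    __m256i low4bits = _mm256_set1_epi8(0xf);
    __m256i c = _mm256_set_m128i(_mm_srli_epi64(chars, 4), chars);
    c = _mm256_and_si256(c, low4bits);
    return _mm256_shuffle_epi8(nib_tables, c);
}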
@@ -516,8 +516,8 @@ const u8 *lastMatch(const u8 *buf, u32 z) {
 static really_inline
 const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                    const m256 low4bits, const m256 zeroes) {
-    m256 c_lo = vpshufb(mask_lo, GET_LO_4(chars));
-    m256 c_hi = vpshufb(mask_hi, GET_HI_4(chars));
+    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
+    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
     m256 t = and256(c_lo, c_hi);
 
 #ifdef DEBUG
@@ -538,7 +538,7 @@ const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
     // do the hi and lo shuffles in the one avx register
     m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
-    m256 c_shuf = vpshufb(mask, c);
+    m256 c_shuf = pshufb_m256(mask, c);
     m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
     // the upper 32-bits can't match
     u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));
@@ -630,8 +630,8 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
     DEBUG_PRINTF("buf %p\n", buf);
     m256 chars_lo = GET_LO_4(chars);
     m256 chars_hi = GET_HI_4(chars);
-    m256 c_lo = vpshufb(mask1_lo, chars_lo);
-    m256 c_hi = vpshufb(mask1_hi, chars_hi);
+    m256 c_lo = pshufb_m256(mask1_lo, chars_lo);
+    m256 c_hi = pshufb_m256(mask1_hi, chars_hi);
     m256 t = or256(c_lo, c_hi);
 
 #ifdef DEBUG
@@ -642,8 +642,8 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
     DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n");
 #endif
 
-    m256 c2_lo = vpshufb(mask2_lo, chars_lo);
-    m256 c2_hi = vpshufb(mask2_hi, chars_hi);
+    m256 c2_lo = pshufb_m256(mask2_lo, chars_lo);
+    m256 c2_hi = pshufb_m256(mask2_hi, chars_hi);
     m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));
 
 #ifdef DEBUG
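rshift128_m256(..., 1) moves each byte's second-character result down one position so it can be combined with the first-character result for the preceding byte. In scalar terms; this is illustrative logic only, since the exact shufti bit encoding is not shown in this hunk:

/* Position i combines the first-character result at i with the
 * second-character result at i + 1 (the shift by one byte). */
static void combine_two_char(const unsigned char r1[],
                             const unsigned char r2[],
                             unsigned char out[], unsigned n) {
    for (unsigned i = 0; i + 1 < n; i++) {
        out[i] = (unsigned char)(r1[i] | r2[i + 1]);
    }
}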
@@ -662,8 +662,8 @@ const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
     // do the hi and lo shuffles in the one avx register
     m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
-    m256 c_shuf1 = vpshufb(mask1, c);
-    m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1);
+    m256 c_shuf1 = pshufb_m256(mask1, c);
+    m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1);
     m256 t0 = or256(c_shuf1, c_shuf2);
     m128 t = or128(movdq_hi(t0), cast256to128(t0));
     // the upper 32-bits can't match
@@ -264,11 +264,11 @@ u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
     m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
 
     // and now do the real work
-    m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v);
+    m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
     m256 t1 = xor256(v, highconst);
-    m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1);
+    m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1);
     m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
-    m256 shuf3 = vpshufb(shuf_mask_hi, t2);
+    m256 shuf3 = pshufb_m256(shuf_mask_hi, t2);
     m256 tmp = and256(or256(shuf1, shuf2), shuf3);
     m256 tmp2 = eq256(tmp, zeroes256());
     u32 z = movemask256(tmp2);
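Truffle covers all 256 byte values with two 16-byte tables: pshufb zeroes its output when the index's top bit is set, so shuf1 only answers for bytes below 0x80 and shuf2 (after xor with highconst) for bytes at or above it, while the 0x8040201008040201 table turns the remaining high-nibble bits into a single bit to test. A scalar sketch of my reading of the test:

/* A byte c is in the set iff the bit selected by its high-nibble bits
 * (1 << ((c >> 4) & 0x7)) is set in the table entry picked by its low
 * nibble, using lo_highset when bit 7 of c is set, else lo_highclear. */
static int truffle_member(const unsigned char lo_highclear[16],
                          const unsigned char lo_highset[16],
                          unsigned char c) {
    const unsigned char *table = (c & 0x80) ? lo_highset : lo_highclear;
    unsigned char bit = (unsigned char)(1u << ((c >> 4) & 0x7));
    return (table[c & 0xf] & bit) != 0;
}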
@@ -1334,11 +1334,11 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
         DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
         expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
                                valid_lo);
-        valid_path_mask = ~movemask256(vpshufb(expand_valid,
-                                               data_select_mask));
+        valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
+                                                   data_select_mask));
     }
 
-    m256 data = vpshufb(data_double, data_select_mask);
+    m256 data = pshufb_m256(data_double, data_select_mask);
     m256 hi_mask = loadu2x128(ri->hi_mask);
     m256 lo_mask = loadu2x128(ri->lo_mask);
     m256 bucket_select_mask = loadu256(ri->bucket_select_mask);
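Here pshufb_m256 acts as a byte gather: data_select_mask routes arbitrary input bytes (within each 128-bit lane) to the positions the multipath masks expect, and the same shuffle applied to expand_valid marks which of those positions carry valid data. A scalar sketch of the gather step, one lane's worth; the names are mine:

/* out[i] = src[sel[i]] when sel[i] is a valid index, else 0, mirroring
 * pshufb's "top bit set => zero" convention. */
static void byte_gather(const unsigned char src[16],
                        const unsigned char sel[16],
                        unsigned char out[16]) {
    for (int i = 0; i < 16; i++) {
        out[i] = (sel[i] & 0x80) ? 0 : src[sel[i] & 0xf];
    }
}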
@@ -1395,11 +1395,11 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
         DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
         expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
                                valid_lo);
-        valid_path_mask = ~movemask256(vpshufb(expand_valid,
-                                               data_select_mask));
+        valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
+                                                   data_select_mask));
     }
 
-    m256 data = vpshufb(data_double, data_select_mask);
+    m256 data = pshufb_m256(data_double, data_select_mask);
 
     m256 hi_mask_1 = loadu2x128(ri->hi_mask);
     m256 hi_mask_2 = loadu2x128(ri->hi_mask + 16);
@@ -1463,15 +1463,15 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
         DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
         expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
                                valid_lo);
-        u32 valid_path_1 = movemask256(vpshufb(expand_valid,
-                                               data_select_mask_1));
-        u32 valid_path_2 = movemask256(vpshufb(expand_valid,
-                                               data_select_mask_2));
+        u32 valid_path_1 = movemask256(pshufb_m256(expand_valid,
+                                                   data_select_mask_1));
+        u32 valid_path_2 = movemask256(pshufb_m256(expand_valid,
+                                                   data_select_mask_2));
         valid_path_mask = ~((u64a)valid_path_1 | (u64a)valid_path_2 << 32);
     }
 
-    m256 data_1 = vpshufb(data_m256, data_select_mask_1);
-    m256 data_2 = vpshufb(data_m256, data_select_mask_2);
+    m256 data_1 = pshufb_m256(data_m256, data_select_mask_1);
+    m256 data_2 = pshufb_m256(data_m256, data_select_mask_2);
 
     m256 hi_mask = loadu2x128(ri->hi_mask);
     m256 lo_mask = loadu2x128(ri->lo_mask);
@@ -48,8 +48,9 @@ int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
                             const m256 lo_mask, const m256 and_mask,
                             const u32 neg_mask, const u32 valid_data_mask) {
     m256 low4bits = set32x8(0xf);
-    m256 c_lo = vpshufb(lo_mask, and256(data, low4bits));
-    m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4));
+    m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
+    m256 c_hi = pshufb_m256(hi_mask,
+                            rshift64_m256(andnot256(low4bits, data), 4));
     m256 t = and256(c_lo, c_hi);
     u32 nresult = movemask256(eq256(and256(t, and_mask), zeroes256()));
 #ifdef DEBUG
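These validators extract high nibbles two different ways: and256(rshift64_m256(data, 4), low4bits) in some functions, rshift64_m256(andnot256(low4bits, data), 4) in others. Both give the same per-byte result, since masking before or after the 4-bit shift isolates the same bits; a scalar sanity sketch per 64-bit word:

#include <stdint.h>

/* Both forms extract the high nibble of every byte of w. */
static uint64_t hi_nib_shift_then_mask(uint64_t w) {
    return (w >> 4) & 0x0f0f0f0f0f0f0f0fULL;
}
static uint64_t hi_nib_mask_then_shift(uint64_t w) {
    return (w & ~0x0f0f0f0f0f0f0f0fULL) >> 4;
}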
@@ -78,7 +79,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
                            const u32 valid_data_mask) {
     m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
     m256 low4bits = set32x8(0xf);
-    m256 c_nib = vpshufb(nib_mask, and256(data_m256, low4bits));
+    m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
     m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
     m128 nresult = eq128(and128(t, and_mask), zeroes128());
 #ifdef DEBUG
@@ -101,8 +102,9 @@ int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
                            const m256 lo_mask, const m256 and_mask,
                            const u32 neg_mask, const u32 valid_data_mask) {
     m256 low4bits = set32x8(0xf);
-    m256 c_lo = vpshufb(lo_mask, and256(data, low4bits));
-    m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4));
+    m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
+    m256 c_hi = pshufb_m256(hi_mask,
+                            rshift64_m256(andnot256(low4bits, data), 4));
     m256 t = and256(c_lo, c_hi);
     m256 nresult = eq256(and256(t, and_mask), zeroes256());
 #ifdef DEBUG
@@ -134,10 +136,10 @@ int validateShuftiMask32x16(const m256 data,
     m256 low4bits = set32x8(0xf);
     m256 data_lo = and256(data, low4bits);
     m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
-    m256 c_lo_1 = vpshufb(lo_mask_1, data_lo);
-    m256 c_lo_2 = vpshufb(lo_mask_2, data_lo);
-    m256 c_hi_1 = vpshufb(hi_mask_1, data_hi);
-    m256 c_hi_2 = vpshufb(hi_mask_2, data_hi);
+    m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
+    m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo);
+    m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi);
+    m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi);
     m256 t1 = and256(c_lo_1, c_hi_1);
     m256 t2 = and256(c_lo_2, c_hi_2);
     m256 result = or256(and256(t1, bucket_mask_lo), and256(t2, bucket_mask_hi));
@@ -200,7 +202,7 @@ int validateMultipathShuftiMask16x8(const m128 data,
                                     const u32 valid_path_mask) {
     m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
     m256 low4bits = set32x8(0xf);
-    m256 c_nib = vpshufb(nib_mask, and256(data_256, low4bits));
+    m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
     m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
     m128 result = and128(t, bucket_select_mask);
     u32 nresult = movemask128(eq128(result, zeroes128()));
@@ -221,8 +223,8 @@ int validateMultipathShuftiMask32x8(const m256 data,
     m256 low4bits = set32x8(0xf);
     m256 data_lo = and256(data, low4bits);
     m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
-    m256 c_lo = vpshufb(lo_mask, data_lo);
-    m256 c_hi = vpshufb(hi_mask, data_hi);
+    m256 c_lo = pshufb_m256(lo_mask, data_lo);
+    m256 c_hi = pshufb_m256(hi_mask, data_hi);
     m256 c = and256(c_lo, c_hi);
     m256 result = and256(c, bucket_select_mask);
     u32 nresult = movemask256(eq256(result, zeroes256()));
@@ -245,10 +247,10 @@ int validateMultipathShuftiMask32x16(const m256 data,
     m256 low4bits = set32x8(0xf);
     m256 data_lo = and256(data, low4bits);
     m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
-    m256 c_lo_1 = vpshufb(lo_mask_1, data_lo);
-    m256 c_lo_2 = vpshufb(lo_mask_2, data_lo);
-    m256 c_hi_1 = vpshufb(hi_mask_1, data_hi);
-    m256 c_hi_2 = vpshufb(hi_mask_2, data_hi);
+    m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
+    m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo);
+    m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi);
+    m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi);
     m256 t1 = and256(c_lo_1, c_hi_1);
     m256 t2 = and256(c_lo_2, c_hi_2);
     m256 result = or256(and256(t1, bucket_select_mask_lo),
@@ -270,11 +272,11 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
                                   const u64a neg_mask,
                                   const u64a valid_path_mask) {
     m256 low4bits = set32x8(0xf);
-    m256 c_lo_1 = vpshufb(lo_mask, and256(data_1, low4bits));
-    m256 c_lo_2 = vpshufb(lo_mask, and256(data_2, low4bits));
-    m256 c_hi_1 = vpshufb(hi_mask,
-                          rshift64_m256(andnot256(low4bits, data_1), 4));
-    m256 c_hi_2 = vpshufb(hi_mask,
-                          rshift64_m256(andnot256(low4bits, data_2), 4));
+    m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
+    m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
+    m256 c_hi_1 = pshufb_m256(hi_mask,
+                              rshift64_m256(andnot256(low4bits, data_1), 4));
+    m256 c_hi_2 = pshufb_m256(hi_mask,
+                              rshift64_m256(andnot256(low4bits, data_2), 4));
     m256 t1 = and256(c_lo_1, c_hi_1);
     m256 t2 = and256(c_lo_2, c_hi_2);
@@ -70,7 +70,8 @@ masked_move256_len(const u8 *buf, const u32 len) {
     u32 end = unaligned_load_u32(buf + len - 4);
     m256 preshufend = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(end));
     m256 v = _mm256_maskload_epi32((const int *)buf, lmask);
-    m256 shufend = vpshufb(preshufend, loadu256(&mm_shuffle_end[len - 4]));
+    m256 shufend = pshufb_m256(preshufend,
+                               loadu256(&mm_shuffle_end[len - 4]));
     m256 target = or256(v, shufend);
 
     return target;
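masked_move256_len reads up to 32 bytes without touching memory past buf + len: the maskload brings in whole 4-byte granules, and the broadcast-plus-shuffle of the last four bytes fills in the tail. A hypothetical scalar equivalent for 4 <= len <= 32, names mine:

#include <stdint.h>
#include <string.h>

/* Copy len bytes into out[32] using only in-bounds reads: the head
 * comes from whole 4-byte granules (the maskload), the tail from one
 * final 4-byte read at buf + len - 4 (the shuffled broadcast). */
static void load_upto_32(const uint8_t *buf, uint32_t len, uint8_t out[32]) {
    memset(out, 0, 32);
    memcpy(out, buf, len & ~3u);
    memcpy(out + len - 4, buf + len - 4, 4);
}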
@@ -279,7 +279,7 @@ m128 pshufb(m128 a, m128 b) {
 }
 
 static really_inline
-m256 vpshufb(m256 a, m256 b) {
+m256 pshufb_m256(m256 a, m256 b) {
 #if defined(HAVE_AVX2)
     return _mm256_shuffle_epi8(a, b);
 #else
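With the rename, the 256-bit wrapper matches the naming of the existing 128-bit pshufb; the non-AVX2 fallback branch is not shown in this hunk. A usage sketch built only from helpers that appear elsewhere in this diff (set32x8, and256):

/* Look up the low nibble of every byte in a broadcast 16-entry table. */
static really_inline
m256 lookup_low_nibbles(m256 table, m256 bytes) {
    m256 low4bits = set32x8(0xf);
    return pshufb_m256(table, and256(bytes, low4bits));
}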