shufti: slightly faster short shufti operation
It is better to shift the high-lane values in an XMM register first and then use a single insert to combine the high and low lanes, rather than shifting inside the YMM register; a sketch of both code shapes follows below.
parent 99e14df117
commit 5a842caaf1
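
For context, here is a minimal standalone sketch of the two code shapes in raw AVX2 intrinsics. It is an illustration, not the library code: vectorscan's set2x128, rshift64_m128 and combine2x128 wrappers are assumed to correspond roughly to the broadcast, 64-bit shift and 128-bit insert shown here.

#include <immintrin.h>

/* Old shape: broadcast the 128-bit chars into both lanes of a YMM
 * register, then use a variable 64-bit shift so that only the high
 * lane is shifted right by 4. */
static inline __m256i nibbles_old(__m128i chars) {
    __m256i c = _mm256_broadcastsi128_si256(chars);
    return _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
}

/* New shape: do the 4-bit shift on the XMM value first, then build the
 * YMM register with a single vinserti128. */
static inline __m256i nibbles_new(__m128i chars) {
    __m128i hi = _mm_srli_epi64(chars, 4);
    return _mm256_inserti128_si256(_mm256_castsi128_si256(chars), hi, 1);
}

Both produce the same vector: the low lane holds the original bytes and the high lane the bytes shifted right by 4, so that after masking with low4bits the two vpshufb lookups see the low and high nibbles respectively.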
@@ -307,8 +307,7 @@ static really_inline
 const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf,
                         const m256 low4bits) {
     // do the hi and lo shuffles in the one avx register
-    m256 c = set2x128(chars);
-    c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
     m256 c_shuf = vpshufb(mask, c);
     m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
@@ -439,8 +438,7 @@ static really_inline
 const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
                         const m256 low4bits) {
     // do the hi and lo shuffles in the one avx register
-    m256 c = set2x128(chars);
-    c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
     m256 c_shuf = vpshufb(mask, c);
     m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
@@ -564,8 +562,7 @@ static really_inline
 const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
                          const m256 low4bits) {
     // do the hi and lo shuffles in the one avx register
-    m256 c = set2x128(chars);
-    c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
     m256 c_shuf1 = vpshufb(mask1, c);
     m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1);