shufti: slightly faster short shufti operation

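It is better to shift the high-lane values while they are still in an XMM
register, and only then use an insert to combine the high and low lanes,
rather than duplicating the input into both lanes of a YMM register and
applying a per-element variable shift to it.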
This commit is contained in:
Matthew Barr 2016-09-13 15:07:31 +10:00
parent 99e14df117
commit 5a842caaf1

View File

@ -307,8 +307,7 @@ static really_inline
const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf,
const m256 low4bits) { const m256 low4bits) {
// do the hi and lo shuffles in the one avx register // do the hi and lo shuffles in the one avx register
m256 c = set2x128(chars); m256 c = combine2x128(rshift64_m128(chars, 4), chars);
c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
c = and256(c, low4bits); c = and256(c, low4bits);
m256 c_shuf = vpshufb(mask, c); m256 c_shuf = vpshufb(mask, c);
m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
@ -439,8 +438,7 @@ static really_inline
const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
const m256 low4bits) { const m256 low4bits) {
// do the hi and lo shuffles in the one avx register // do the hi and lo shuffles in the one avx register
m256 c = set2x128(chars); m256 c = combine2x128(rshift64_m128(chars, 4), chars);
c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
c = and256(c, low4bits); c = and256(c, low4bits);
m256 c_shuf = vpshufb(mask, c); m256 c_shuf = vpshufb(mask, c);
m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
@ -564,8 +562,7 @@ static really_inline
const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
const m256 low4bits) { const m256 low4bits) {
// do the hi and lo shuffles in the one avx register // do the hi and lo shuffles in the one avx register
m256 c = set2x128(chars); m256 c = combine2x128(rshift64_m128(chars, 4), chars);
c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
c = and256(c, low4bits); c = and256(c, low4bits);
m256 c_shuf1 = vpshufb(mask1, c); m256 c_shuf1 = vpshufb(mask1, c);
m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1); m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1);