shufti: slightly faster short shufti operation
It is better to shift the high-lane values in an XMM register first and then use a single insert to combine the high and low lanes, rather than shifting inside the YMM register; a sketch of both code shapes follows below.
parent 99e14df117
commit 5a842caaf1
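
For context, here is a minimal standalone sketch of the two code shapes in raw AVX2 intrinsics. It is an illustration, not the library code: vectorscan's set2x128, rshift64_m128 and combine2x128 wrappers are assumed to correspond roughly to the broadcast, 64-bit shift and 128-bit insert shown here.

#include <immintrin.h>

/* Old shape: broadcast the 128-bit chars into both lanes of a YMM
 * register, then use a variable 64-bit shift so that only the high
 * lane is shifted right by 4. */
static inline __m256i nibbles_old(__m128i chars) {
    __m256i c = _mm256_broadcastsi128_si256(chars);
    return _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
}

/* New shape: do the 4-bit shift on the XMM value first, then build the
 * YMM register with a single vinserti128. */
static inline __m256i nibbles_new(__m128i chars) {
    __m128i hi = _mm_srli_epi64(chars, 4);
    return _mm256_inserti128_si256(_mm256_castsi128_si256(chars), hi, 1);
}

Both produce the same vector: the low lane holds the original bytes and the high lane the bytes shifted right by 4, so that after masking with low4bits the two vpshufb lookups see the low and high nibbles respectively.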
@@ -307,8 +307,7 @@ static really_inline
 const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf,
                         const m256 low4bits) {
     // do the hi and lo shuffles in the one avx register
-    m256 c = set2x128(chars);
-    c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
     m256 c_shuf = vpshufb(mask, c);
     m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
@@ -439,8 +438,7 @@ static really_inline
 const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
                         const m256 low4bits) {
     // do the hi and lo shuffles in the one avx register
-    m256 c = set2x128(chars);
-    c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
     m256 c_shuf = vpshufb(mask, c);
     m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
@@ -564,8 +562,7 @@ static really_inline
 const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
                          const m256 low4bits) {
     // do the hi and lo shuffles in the one avx register
-    m256 c = set2x128(chars);
-    c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
     c = and256(c, low4bits);
     m256 c_shuf1 = vpshufb(mask1, c);
     m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1);