/*
 * Patch summary (descriptive preamble; everything below the first
 * "diff --git" header is the unmodified patch text).
 *
 * Purpose: make the "low half / high half" ordering consistent across three
 * files.  Each hunk swaps which 128-bit half carries which operand.
 *
 * src/nfa/shufti.c
 *   In fwdBlockShort, revBlockShort and fwdBlockShort2 the per-lane shift
 *   counts handed to _mm256_srlv_epi64 change from
 *   _mm256_set_epi64x(0, 0, 4, 4) to _mm256_set_epi64x(4, 4, 0, 0).
 *   The register being shifted is set2x128(chars), and the result is then
 *   masked with low4bits and used as the vpshufb index.
 *   NOTE(review): the Intel reference lists _mm256_set_epi64x arguments
 *   most-significant lane first, so the new form would shift the upper two
 *   64-bit lanes by 4 and leave the lower two unshifted -- confirm.
 *
 * src/rose/rose_build_bytecode.cpp
 *   In the hunk labelled makeRoleShufti, nib_mask is now filled with the
 *   first 16 entries of lo_mask at offset 0 and the first 16 entries of
 *   hi_mask at offset 16; before the patch the two were the other way round.
 *   The bucket_select_mask_16 copy is unchanged context.
 *
 * src/util/simd_utils.h
 *   First combine2x128 definition: parameters renamed from (a, b) to
 *   (hi, lo) and the aggregate initialiser changes from {a, b} to {lo, hi}.
 *   Second combine2x128 definition (closed by "#endif //AVX2"): only the
 *   #else fallback changes, from insert128to256(cast128to256(hi), lo, 1) to
 *   insert128to256(cast128to256(lo), hi, 1).  The _mm256_set_m128i(hi, lo)
 *   branch is untouched.
 *   NOTE(review): presumably this makes the fallback agree with the
 *   _mm256_set_m128i branch; insert128to256 and cast128to256 are defined
 *   outside this patch -- verify their semantics.
 *
 * NOTE(review): this copy of the patch has its line breaks collapsed, and
 * "const vector &look", "array nib_mask", "array bucket_select_mask_16" and
 * "make_unique" appear without template arguments.  It looks damaged in
 * extraction and is unlikely to apply as-is; regenerate it from the
 * repository before use.
 */
diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 57890478..2e63be9f 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -308,7 +308,7 @@ const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = set2x128(chars); - c = _mm256_srlv_epi64(c, _mm256_set_epi64x(0, 0, 4, 4)); + c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0)); c = and256(c, low4bits); m256 c_shuf = vpshufb(mask, c); m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); @@ -440,7 +440,7 @@ const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = set2x128(chars); - c = _mm256_srlv_epi64(c, _mm256_set_epi64x(0, 0, 4, 4)); + c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0)); c = and256(c, low4bits); m256 c_shuf = vpshufb(mask, c); m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); @@ -565,7 +565,7 @@ const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = set2x128(chars); - c = _mm256_srlv_epi64(c, _mm256_set_epi64x(0, 0, 4, 4)); + c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0)); c = and256(c, low4bits); m256 c_shuf1 = vpshufb(mask1, c); m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 43df7962..f074973d 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -3054,8 +3054,8 @@ bool makeRoleShufti(const vector &look, neg_mask &= 0xffff; array nib_mask; array bucket_select_mask_16; - copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin()); - copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin() + 16); + copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin()); + copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16); 
copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16, bucket_select_mask_16.begin()); auto ri = make_unique diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index afa8c7f8..35e1a390 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -658,8 +658,8 @@ m128 movdq_lo(m256 x) { } static really_inline -m256 combine2x128(m128 a, m128 b) { - m256 rv = {a, b}; +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; return rv; } @@ -712,7 +712,7 @@ m256 combine2x128(m128 hi, m128 lo) { #if defined(_mm256_set_m128i) return _mm256_set_m128i(hi, lo); #else - return insert128to256(cast128to256(hi), lo, 1); + return insert128to256(cast128to256(lo), hi, 1); #endif } #endif //AVX2