diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 606892fb..74f447fb 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -95,7 +95,18 @@ static really_inline m128 eq128(m128 a, m128 b) {
     return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b);
 }
 
-#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
+static really_inline u32 movemask128(m128 a) {
+    static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
+
+    // AND with per-byte bit weights, then fold with widening pairwise adds;
+    uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
+
+    // Each 64-bit half now holds 8 mask bits in its low byte; combine them
+    uint16_t output;
+    vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0);
+    vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8);
+    return output;
+}
 
 static really_inline m128 set1_16x8(u8 c) {
     return (m128) vdupq_n_u8(c);
@@ -229,21 +240,22 @@ void clearbit128(m128 *ptr, unsigned int n) {
 
 static really_inline
 char testbit128(m128 val, unsigned int n) {
     const m128 mask = mask1bit128(n);
-#if defined(HAVE_SSE41)
-    return !_mm_testz_si128(mask, val);
-#else
+
     return isnonzero128(and128(mask, val));
-#endif
 }
 
-// offset must be an immediate
-#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
+static really_inline
+m128 palignr(m128 r, m128 l, int offset) {
+    return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
+}
 
 static really_inline m128 pshufb_m128(m128 a, m128 b) {
-    m128 result;
-    result = _mm_shuffle_epi8(a, b);
-    return result;
+    /* On Intel, if bit 0x80 of a control byte is set, the result byte is zero;
+       otherwise the low four bits (b & 0xf) select a lane of a. On NEON, any
+       index >= 16 yields zero. btranslated maps the Intel semantics to NEON. */
+    int8x16_t btranslated = vandq_s8((int8x16_t)b, vdupq_n_s8(0x8f));
+    return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated);
 }
 
 static really_inline
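
Below the patch: a minimal standalone sketch (not part of the diff) of the movemask128 emulation, checked against the expected mask. It assumes an AArch64 toolchain with <arm_neon.h>; the helper name neon_movemask and the test pattern are hypothetical. Note the AND-with-powers trick only reproduces _mm_movemask_epi8 when every input byte is 0x00 or 0xFF, so the test feeds it a compare result, which is the form the callers here pass in.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Same technique as the patch: AND each byte with its bit weight, then fold
 * with widening pairwise adds until each 64-bit half holds 8 result bits. */
static uint32_t neon_movemask(uint8x16_t a) {
    static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128,
                                       1, 2, 4, 8, 16, 32, 64, 128 };
    uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(a, powers))));
    uint16_t out;
    vst1q_lane_u8((uint8_t *)&out + 0, vreinterpretq_u8_u64(mask), 0);
    vst1q_lane_u8((uint8_t *)&out + 1, vreinterpretq_u8_u64(mask), 8);
    return out;
}

int main(void) {
    uint8_t buf[16];
    for (int i = 0; i < 16; i++) {
        buf[i] = (uint8_t)(i * 37 + 11); /* 16 distinct byte values */
    }
    /* The compare yields 0x00/0xFF bytes, the only input form the trick handles. */
    uint32_t got = neon_movemask(vceqq_u8(vld1q_u8(buf), vdupq_n_u8(buf[5])));
    uint32_t want = 1u << 5; /* only lane 5 matches itself */
    printf("neon=%04x ref=%04x %s\n", got, want, got == want ? "OK" : "FAIL");
    return got != want;
}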
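
A similar sketch (again not part of the diff) of why vextq maps onto _mm_alignr_epi8: both take 16 bytes starting at `offset` within the 32-byte concatenation that has l in the low half and r in the high half. One caveat the removed macro documented and the new wrapper inherits: vextq_* takes an immediate, so palignr only compiles when it is inlined with a compile-time-constant offset.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t lo[16], hi[16], got[16];
    for (int i = 0; i < 16; i++) {
        lo[i] = (uint8_t)i;        /* bytes 0..15  */
        hi[i] = (uint8_t)(16 + i); /* bytes 16..31 */
    }
    /* Take 16 bytes starting 5 bytes into lo:hi -> expect 5, 6, ..., 20. */
    vst1q_u8(got, vextq_u8(vld1q_u8(lo), vld1q_u8(hi), 5));
    int ok = 1;
    for (int i = 0; i < 16; i++) {
        ok &= (got[i] == (uint8_t)(5 + i));
    }
    printf("vextq as palignr: %s\n", ok ? "OK" : "FAIL");
    return !ok;
}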
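
Finally, a sketch (not part of the diff; neon_pshufb is an illustrative name) checking the pshufb translation against a scalar model of the Intel semantics: a control byte with bit 0x80 set produces zero, otherwise its low four bits index a. Masking with 0x8f clears bits 4-6 (which Intel ignores but vqtbl1q would not) while keeping bit 7, so every "zero this lane" byte becomes an out-of-range index.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* The patch's translation: mask the control bytes so NEON's table lookup
 * (out-of-range index -> 0) reproduces Intel's pshufb behaviour. */
static uint8x16_t neon_pshufb(uint8x16_t a, uint8x16_t b) {
    uint8x16_t btranslated = vandq_u8(b, vdupq_n_u8(0x8f));
    return vqtbl1q_u8(a, btranslated);
}

int main(void) {
    uint8_t a[16], b[16], got[16];
    for (int i = 0; i < 16; i++) {
        a[i] = (uint8_t)(0xA0 + i);
        b[i] = (uint8_t)(i * 73 + 29); /* arbitrary controls, some with 0x80 set */
    }
    vst1q_u8(got, neon_pshufb(vld1q_u8(a), vld1q_u8(b)));
    int ok = 1;
    for (int i = 0; i < 16; i++) {
        uint8_t want = (b[i] & 0x80) ? 0 : a[b[i] & 0x0f];
        ok &= (got[i] == want);
    }
    printf("pshufb translation: %s\n", ok ? "OK" : "FAIL");
    return !ok;
}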