mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
add ARM simd_utils vectorized functions for 128-bit vectors
This commit is contained in:
parent
5b425bd5a6
commit
c5a7f4b846
@ -95,7 +95,18 @@ static really_inline m128 eq128(m128 a, m128 b) {
|
||||
return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b);
|
||||
}
|
||||
|
||||
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
|
||||
static really_inline u32 movemask128(m128 a) {
|
||||
static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
|
||||
|
||||
// Compute the mask from the input
|
||||
uint64x2_t mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
|
||||
|
||||
// Get the resulting bytes
|
||||
uint16_t output;
|
||||
vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0);
|
||||
vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8);
|
||||
return output;
|
||||
}
|
||||
|
||||
static really_inline m128 set1_16x8(u8 c) {
|
||||
return (m128) vdupq_n_u8(c);
|
||||
@ -229,21 +240,22 @@ void clearbit128(m128 *ptr, unsigned int n) {
|
||||
static really_inline
|
||||
char testbit128(m128 val, unsigned int n) {
|
||||
const m128 mask = mask1bit128(n);
|
||||
#if defined(HAVE_SSE41)
|
||||
return !_mm_testz_si128(mask, val);
|
||||
#else
|
||||
|
||||
return isnonzero128(and128(mask, val));
|
||||
#endif
|
||||
}
|
||||
|
||||
// offset must be an immediate
|
||||
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
|
||||
static really_inline
|
||||
m128 palignr(m128 r, m128 l, int offset) {
|
||||
return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 pshufb_m128(m128 a, m128 b) {
|
||||
m128 result;
|
||||
result = _mm_shuffle_epi8(a, b);
|
||||
return result;
|
||||
/* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
|
||||
In NEON, if >=16, then the result is zero, otherwise it is that lane.
|
||||
btranslated is the version that is converted from Intel to NEON. */
|
||||
int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f));
|
||||
return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
|
Loading…
x
Reference in New Issue
Block a user