Mirror of https://github.com/VectorCamp/vectorscan.git (synced 2025-06-28 16:41:01 +03:00)

commit e2f253d8ab (parent a039089888)

    remove loads from movemask128, variable_byte_shift, add palignr_imm(), minor fixes
@@ -121,16 +121,18 @@ static really_inline m128 eq64_m128(m128 a, m128 b) {
     return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b);
 }


 static really_inline u32 movemask128(m128 a) {
     static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

     // Compute the mask from the input
     uint64x2_t mask  = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
+    uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7);
+    mask = vorrq_u8(mask, mask1);

     // Get the resulting bytes
     uint16_t output;
-    vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0);
-    vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8);
+    vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0);
     return output;
 }

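The new movemask128 folds the two per-half mask bytes together with vextq/vorrq so that a single 16-bit lane store replaces the two 8-bit lane stores. Below is a minimal standalone sketch of the same technique, using plain NEON types in place of the repo's m128/zeroes128() wrappers (function and variable names here are illustrative, not from the source):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch: build an SSE-style 16-bit movemask from a byte-wise compare result. */
static uint16_t movemask128_sketch(uint8x16_t a) {
    /* One power of two per byte position within each 8-byte half. */
    static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128,
                                       1, 2, 4, 8, 16, 32, 64, 128 };
    /* Keep each lane's assigned bit; three widening pairwise adds then leave
     * each half's 8-bit mask in the low byte of its 64-bit lane. */
    uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(a, powers))));
    /* Slide the high half's mask byte (lane 8) down next to the low half's
     * (lane 0), so one 16-bit store captures both. */
    uint8x16_t m  = vreinterpretq_u8_u64(mask);
    uint8x16_t m1 = vextq_u8(m, vdupq_n_u8(0), 7);
    m = vorrq_u8(m, m1);
    uint16_t output;
    vst1q_lane_u16(&output, vreinterpretq_u16_u8(m), 0);
    return output;
}

int main(void) {
    static const uint8_t data[16] = { 0, 7, 0, 0, 9, 0, 0, 0,
                                      1, 0, 0, 0, 0, 0, 0, 4 };
    /* Non-zero bytes compare unequal to zero -> 0xff lanes. */
    uint8x16_t cmp = vmvnq_u8(vceqq_u8(vld1q_u8(data), vdupq_n_u8(0)));
    printf("0x%04x\n", movemask128_sketch(cmp)); /* expect 0x8112 */
    return 0;
}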
@@ -233,14 +235,12 @@ static really_inline m128 andnot128(m128 a, m128 b) {
 // aligned load
 static really_inline m128 load128(const void *ptr) {
     assert(ISALIGNED_N(ptr, alignof(m128)));
-    ptr = assume_aligned(ptr, 16);
     return (m128) vld1q_s32((const int32_t *)ptr);
 }

 // aligned store
 static really_inline void store128(void *ptr, m128 a) {
     assert(ISALIGNED_N(ptr, alignof(m128)));
-    ptr = assume_aligned(ptr, 16);
     vst1q_s32((int32_t *)ptr, a);
 }

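With the assume_aligned() hints gone, alignment is still checked by the assert() in debug builds but no longer promised to the compiler as an optimization hint. A minimal usage sketch of the aligned pair, assuming plain NEON types rather than the repo's m128 typedef:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* _Alignas guarantees the 16-byte alignment the aligned load/store expect. */
    _Alignas(16) int32_t buf[4] = { 1, 2, 3, 4 };
    int32x4_t v = vld1q_s32(buf);   /* aligned load  (the load128 body)  */
    v = vaddq_s32(v, v);            /* double each lane                  */
    vst1q_s32(buf, v);              /* aligned store (the store128 body) */
    printf("%d %d %d %d\n", buf[0], buf[1], buf[2], buf[3]); /* 2 4 6 8 */
    return 0;
}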
@@ -270,22 +270,13 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
     return a;
 }

-static really_inline
-m128 variable_byte_shift_m128(m128 in, s32 amount) {
-    assert(amount >= -16 && amount <= 16);
-    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
-    return vqtbl1q_s8(in, shift_mask);
-}
-
 #define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break;

-static really_inline
-m128 palignr(m128 r, m128 l, int offset) {
-#if defined(HS_OPTIMIZE)
-    return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
-#else
+static really_really_inline
+m128 palignr_imm(m128 r, m128 l, int offset) {
     switch (offset) {
-    CASE_ALIGN_VECTORS(l, r, 0);
+    case 0: return l; break;
     CASE_ALIGN_VECTORS(l, r, 1);
     CASE_ALIGN_VECTORS(l, r, 2);
     CASE_ALIGN_VECTORS(l, r, 3);
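vextq_s8 only accepts a compile-time-constant offset, so palignr_imm dispatches a runtime offset through a switch whose cases each pass a literal immediate (cases 4 through 15 continue in the next hunk, where a case 16 is also added). A standalone sketch of the same dispatch, with plain NEON types instead of m128 and illustrative names:

#include <arm_neon.h>
#include <stdint.h>

/* Each case hands vextq_u8 a literal immediate, as the intrinsic requires. */
#define CASE_ALIGN(l, r, off) case off: return vextq_u8((l), (r), (off));

static uint8x16_t palignr_imm_sketch(uint8x16_t r, uint8x16_t l, int offset) {
    switch (offset) {
    case 0: return l;                 /* offset 0: the low vector unchanged   */
    CASE_ALIGN(l, r, 1)  CASE_ALIGN(l, r, 2)  CASE_ALIGN(l, r, 3)
    CASE_ALIGN(l, r, 4)  CASE_ALIGN(l, r, 5)  CASE_ALIGN(l, r, 6)
    CASE_ALIGN(l, r, 7)  CASE_ALIGN(l, r, 8)  CASE_ALIGN(l, r, 9)
    CASE_ALIGN(l, r, 10) CASE_ALIGN(l, r, 11) CASE_ALIGN(l, r, 12)
    CASE_ALIGN(l, r, 13) CASE_ALIGN(l, r, 14) CASE_ALIGN(l, r, 15)
    case 16: return r;                /* offset 16: the high vector unchanged */
    default: return vdupq_n_u8(0);    /* out of range: all zeroes             */
    }
}
#undef CASE_ALIGN

The HS_OPTIMIZE wrapper in the next hunk passes the offset straight to vextq_s8, which presumably relies on inlining at optimized builds folding the offset to a constant.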
@@ -301,30 +292,42 @@ m128 palignr(m128 r, m128 l, int offset) {
     CASE_ALIGN_VECTORS(l, r, 13);
     CASE_ALIGN_VECTORS(l, r, 14);
     CASE_ALIGN_VECTORS(l, r, 15);
+    case 16: return r; break;
     default:
         return zeroes128();
         break;
     }
+}

+static really_really_inline
+m128 palignr(m128 r, m128 l, int offset) {
+#if defined(HS_OPTIMIZE)
+    return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
+#else
+    return palignr_imm(r, l, offset);
 #endif
 }
 #undef CASE_ALIGN_VECTORS

 static really_really_inline
 m128 rshiftbyte_m128(m128 a, unsigned b) {
-    if (b)
     return palignr(zeroes128(), a, b);
-    else
-        return a;
 }

 static really_really_inline
 m128 lshiftbyte_m128(m128 a, unsigned b) {
-    if (b)
     return palignr(a, zeroes128(), 16 - b);
-    else
-        return a;
 }

+static really_inline
+m128 variable_byte_shift_m128(m128 in, s32 amount) {
+    assert(amount >= -16 && amount <= 16);
+    static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
+    const uint8x16_t outside_mask = set1_16x8(0xf0);
+
+    m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount);
+    return vqtbl1q_s8(in, shift_mask);
+}

 #ifdef __cplusplus
 extern "C" {
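The rewritten variable_byte_shift_m128 no longer loads its shuffle mask from memory (the old vbs_mask_data table); it builds the mask by sliding the identity index vector {0..15} against a vector of 0xf0 bytes, and the table lookup returns zero for any lane whose index is out of range, which supplies the shifted-in zeroes. A standalone sketch of the trick for a fixed left shift of 3 bytes, with plain NEON types and hypothetical names:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Shift a 16-byte vector left by 3 byte positions, zero-filling. */
static uint8x16_t lshift3_bytes(uint8x16_t in) {
    static const uint8x16_t idx = { 0, 1, 2, 3, 4, 5, 6, 7,
                                    8, 9, 10, 11, 12, 13, 14, 15 };
    const uint8x16_t outside = vdupq_n_u8(0xf0);
    /* Lane i of the mask is i-3 for i >= 3, else 0xf0. vqtbl1q_u8 yields 0
     * for indices >= 16, so those lanes become the shifted-in zeroes. */
    uint8x16_t shift_mask = vextq_u8(outside, idx, 16 - 3);
    return vqtbl1q_u8(in, shift_mask);
}

int main(void) {
    static const uint8_t data[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
                                      9, 10, 11, 12, 13, 14, 15, 16 };
    uint8_t out[16];
    vst1q_u8(out, lshift3_bytes(vld1q_u8(data)));
    for (int i = 0; i < 16; i++) printf("%d ", out[i]); /* 0 0 0 1 ... 13 */
    printf("\n");
    return 0;
}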