diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index f7b92e70..dcf3fe58 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -202,6 +202,21 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) {
 #endif
 }
 
+/* Keep the low 64-bit lane of `in`; the high lane of the result is zero. */
+static really_inline m128 low64from128(const m128 in) {
+    return (m128) vcombine_u64(vget_low_u64((uint64x2_t)in), vdup_n_u64(0));
+}
+
+/* Move the high 64-bit lane of `in` into the low lane; the high lane is zero. */
+static really_inline m128 high64from128(const m128 in) {
+    return (m128) vcombine_u64(vget_high_u64((uint64x2_t)in), vdup_n_u64(0));
+}
+
+/* Add a and b as two independent unsigned 64-bit lanes (not one 128-bit add). */
+static really_inline m128 add128(m128 a, m128 b) {
+    return (m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b);
+}
+
 static really_inline m128 and128(m128 a, m128 b) {
     return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b);
 }
@@ -381,13 +396,13 @@ m128 sub_u8_m128(m128 a, m128 b) {
 
 static really_inline
 m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
-    uint32_t __attribute__((aligned(16))) data[4] = { x0, x1, x2, x3 };
+    uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 };
     return (m128) vld1q_u32((uint32_t *) data);
 }
 
 static really_inline
 m128 set2x64(u64a hi, u64a lo) {
-    uint64_t __attribute__((aligned(16))) data[2] = { lo, hi };
+    uint64_t ALIGN_ATTR(16) data[2] = { lo, hi };
     return (m128) vld1q_u64((uint64_t *) data);
 }
 
diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h
index 0c67ee94..b20becdc 100644
--- a/src/util/arch/common/simd_utils.h
+++ b/src/util/arch/common/simd_utils.h
@@ -46,7 +46,7 @@
 
 #ifdef DEBUG
 static inline void print_m128_16x8(char *label, m128 vector) {
-    uint8_t __attribute__((aligned(16))) data[16];
+    uint8_t ALIGN_ATTR(16) data[16];
     store128(data, vector);
     DEBUG_PRINTF("%s: ", label);
     for(int i=0; i < 16; i++)
@@ -55,7 +55,7 @@ static inline void print_m128_16x8(char *label, m128 vector) {
 }
 
 static inline void print_m128_8x16(char *label, m128 vector) {
-    uint16_t __attribute__((aligned(16))) data[8];
+    uint16_t ALIGN_ATTR(16) data[8];
     store128(data, vector);
     DEBUG_PRINTF("%s: ", label);
     for(int i=0; i < 8; i++)
@@ -64,7 +64,7 @@ static inline void print_m128_8x16(char *label, m128 vector) {
 }
 
 static inline void print_m128_4x32(char *label, m128 vector) {
-    uint32_t __attribute__((aligned(16))) data[4];
+    uint32_t ALIGN_ATTR(16) data[4];
     store128(data, vector);
     DEBUG_PRINTF("%s: ", label);
     for(int i=0; i < 4; i++)
@@ -73,7 +73,7 @@ static inline void print_m128_4x32(char *label, m128 vector) {
 }
 
 static inline void print_m128_2x64(char *label, m128 vector) {
-    uint64_t __attribute__((aligned(16))) data[2];
+    uint64_t ALIGN_ATTR(16) data[2];
     store128(data, vector);
     DEBUG_PRINTF("%s: ", label);
     for(int i=0; i < 2; i++)
@@ -146,6 +146,14 @@ static really_inline m256 ones256(void) {
     return rv;
 }
 
+/* Add a and b by applying add128 to the low and high 128-bit halves. */
+static really_inline m256 add256(m256 a, m256 b) {
+    m256 rv;
+    rv.lo = add128(a.lo, b.lo);
+    rv.hi = add128(a.hi, b.hi);
+    return rv;
+}
+
 static really_inline m256 and256(m256 a, m256 b) {
     m256 rv;
     rv.lo = and128(a.lo, b.lo);
@@ -585,6 +593,14 @@ m512 set1_4x128(m128 a) {
     return rv;
 }
 
+/* Add a and b by applying add256 to the low and high 256-bit halves. */
+static really_inline
+m512 add512(m512 a, m512 b) {
+    m512 rv;
+    rv.lo = add256(a.lo, b.lo);
+    rv.hi = add256(a.hi, b.hi);
+    return rv;
+}
 
 static really_inline
 m512 and512(m512 a, m512 b) {