mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Merge pull request #119 from VectorCamp/feature/vsx-optimizations
VSX optimizations
This commit is contained in:
commit
0c97e5f2c2
@ -53,24 +53,6 @@
|
|||||||
|
|
||||||
#include <string.h> // for memcpy
|
#include <string.h> // for memcpy
|
||||||
|
|
||||||
#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
|
|
||||||
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
|
|
||||||
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
|
|
||||||
|
|
||||||
/** \brief LUT for the mask1bit functions. */
|
|
||||||
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
|
|
||||||
ZEROES_32, ZEROES_32,
|
|
||||||
ZEROES_31, 0x01, ZEROES_32,
|
|
||||||
ZEROES_31, 0x02, ZEROES_32,
|
|
||||||
ZEROES_31, 0x04, ZEROES_32,
|
|
||||||
ZEROES_31, 0x08, ZEROES_32,
|
|
||||||
ZEROES_31, 0x10, ZEROES_32,
|
|
||||||
ZEROES_31, 0x20, ZEROES_32,
|
|
||||||
ZEROES_31, 0x40, ZEROES_32,
|
|
||||||
ZEROES_31, 0x80, ZEROES_32,
|
|
||||||
ZEROES_32, ZEROES_32,
|
|
||||||
};
|
|
||||||
|
|
||||||
static really_inline m128 ones128(void) {
|
static really_inline m128 ones128(void) {
|
||||||
return (m128) vdupq_n_s8(0xFF);
|
return (m128) vdupq_n_s8(0xFF);
|
||||||
}
|
}
|
||||||
@ -595,9 +577,9 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
|||||||
static really_inline
|
static really_inline
|
||||||
m128 mask1bit128(unsigned int n) {
|
m128 mask1bit128(unsigned int n) {
|
||||||
assert(n < sizeof(m128) * 8);
|
assert(n < sizeof(m128) * 8);
|
||||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
static m128 onebit = { 1, 0 };
|
||||||
mask_idx -= n / 8;
|
m128 mask = lshiftbyte_m128( onebit, n / 8 );
|
||||||
return loadu128(&simd_onebit_masks[mask_idx]);
|
return lshift64_m128( mask, n % 8 );
|
||||||
}
|
}
|
||||||
|
|
||||||
// switches on bit N in the given vector.
|
// switches on bit N in the given vector.
|
||||||
|
@ -88,6 +88,26 @@ static inline void print_m128_2x64(const char *label, m128 vec) {
|
|||||||
#define print_m128_2x64(label, vec) ;
|
#define print_m128_2x64(label, vec) ;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if !defined(ARCH_IA32) && !defined(ARCH_X86_64)
|
||||||
|
#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
|
||||||
|
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
|
||||||
|
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
|
||||||
|
|
||||||
|
/** \brief LUT for the mask1bit functions. */
|
||||||
|
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
|
||||||
|
ZEROES_32, ZEROES_32,
|
||||||
|
ZEROES_31, 0x01, ZEROES_32,
|
||||||
|
ZEROES_31, 0x02, ZEROES_32,
|
||||||
|
ZEROES_31, 0x04, ZEROES_32,
|
||||||
|
ZEROES_31, 0x08, ZEROES_32,
|
||||||
|
ZEROES_31, 0x10, ZEROES_32,
|
||||||
|
ZEROES_31, 0x20, ZEROES_32,
|
||||||
|
ZEROES_31, 0x40, ZEROES_32,
|
||||||
|
ZEROES_31, 0x80, ZEROES_32,
|
||||||
|
ZEROES_32, ZEROES_32,
|
||||||
|
};
|
||||||
|
#endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64)
|
||||||
|
|
||||||
/****
|
/****
|
||||||
**** 256-bit Primitives
|
**** 256-bit Primitives
|
||||||
****/
|
****/
|
||||||
|
@ -54,34 +54,6 @@ typedef __vector signed char int8x16_t;
|
|||||||
|
|
||||||
typedef unsigned long long int ulong64_t;
|
typedef unsigned long long int ulong64_t;
|
||||||
typedef signed long long int long64_t;
|
typedef signed long long int long64_t;
|
||||||
/*
|
|
||||||
typedef __vector uint64_t uint64x2_t;
|
|
||||||
typedef __vector int64_t int64x2_t;
|
|
||||||
typedef __vector uint32_t uint32x4_t;
|
|
||||||
typedef __vector int32_t int32x4_t;
|
|
||||||
typedef __vector uint16_t uint16x8_t;
|
|
||||||
typedef __vector int16_t int16x8_t;
|
|
||||||
typedef __vector uint8_t uint8x16_t;
|
|
||||||
typedef __vector int8_t int8x16_t;*/
|
|
||||||
|
|
||||||
|
|
||||||
#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
|
|
||||||
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
|
|
||||||
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
|
|
||||||
|
|
||||||
/** \brief LUT for the mask1bit functions. */
|
|
||||||
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
|
|
||||||
ZEROES_32, ZEROES_32,
|
|
||||||
ZEROES_31, 0x01, ZEROES_32,
|
|
||||||
ZEROES_31, 0x02, ZEROES_32,
|
|
||||||
ZEROES_31, 0x04, ZEROES_32,
|
|
||||||
ZEROES_31, 0x08, ZEROES_32,
|
|
||||||
ZEROES_31, 0x10, ZEROES_32,
|
|
||||||
ZEROES_31, 0x20, ZEROES_32,
|
|
||||||
ZEROES_31, 0x40, ZEROES_32,
|
|
||||||
ZEROES_31, 0x80, ZEROES_32,
|
|
||||||
ZEROES_32, ZEROES_32,
|
|
||||||
};
|
|
||||||
|
|
||||||
static really_inline m128 ones128(void) {
|
static really_inline m128 ones128(void) {
|
||||||
return (m128) vec_splat_u8(-1);
|
return (m128) vec_splat_u8(-1);
|
||||||
@ -115,10 +87,6 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
|
|||||||
m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b);
|
m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b);
|
||||||
mask = vec_and(not128(mask), movemask);
|
mask = vec_and(not128(mask), movemask);
|
||||||
m128 sum = vec_sums(mask, zeroes128());
|
m128 sum = vec_sums(mask, zeroes128());
|
||||||
//sum = vec_sld(zeroes128(), sum, 4);
|
|
||||||
//s32 ALIGN_ATTR(16) x;
|
|
||||||
//vec_ste(sum, 0, &x);
|
|
||||||
//return x; // it could be ~(movemask_128(mask)) & 0x;
|
|
||||||
return sum[3];
|
return sum[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -131,10 +99,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
|
|||||||
uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
|
uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
|
||||||
mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask);
|
mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask);
|
||||||
m128 sum = vec_sums((m128)mask, zeroes128());
|
m128 sum = vec_sums((m128)mask, zeroes128());
|
||||||
//sum = vec_sld(zeroes128(), sum, 4);
|
|
||||||
//s32 ALIGN_ATTR(16) x;
|
|
||||||
//vec_ste(sum, 0, &x);
|
|
||||||
//return x;
|
|
||||||
return sum[3];
|
return sum[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,46 +114,18 @@ m128 sub_2x64(m128 a, m128 b) {
|
|||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m128 lshift_m128(m128 a, unsigned b) {
|
m128 lshift_m128(m128 a, unsigned b) {
|
||||||
switch(b){
|
if (b == 0) return a;
|
||||||
case 1: return vec_sld(a, zeroes128(), 1); break;
|
m128 sl = (m128) vec_splats((uint8_t) b << 3);
|
||||||
case 2: return vec_sld(a, zeroes128(), 2); break;
|
m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl);
|
||||||
case 3: return vec_sld(a, zeroes128(), 3); break;
|
return result;
|
||||||
case 4: return vec_sld(a, zeroes128(), 4); break;
|
|
||||||
case 5: return vec_sld(a, zeroes128(), 5); break;
|
|
||||||
case 6: return vec_sld(a, zeroes128(), 6); break;
|
|
||||||
case 7: return vec_sld(a, zeroes128(), 7); break;
|
|
||||||
case 8: return vec_sld(a, zeroes128(), 8); break;
|
|
||||||
case 9: return vec_sld(a, zeroes128(), 9); break;
|
|
||||||
case 10: return vec_sld(a, zeroes128(), 10); break;
|
|
||||||
case 11: return vec_sld(a, zeroes128(), 11); break;
|
|
||||||
case 12: return vec_sld(a, zeroes128(), 12); break;
|
|
||||||
case 13: return vec_sld(a, zeroes128(), 13); break;
|
|
||||||
case 14: return vec_sld(a, zeroes128(), 14); break;
|
|
||||||
case 15: return vec_sld(a, zeroes128(), 15); break;
|
|
||||||
}
|
|
||||||
return a;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m128 rshift_m128(m128 a, unsigned b) {
|
m128 rshift_m128(m128 a, unsigned b) {
|
||||||
switch(b){
|
if (b == 0) return a;
|
||||||
case 1: return vec_sld(zeroes128(), a, 15); break;
|
m128 sl = (m128) vec_splats((uint8_t) b << 3);
|
||||||
case 2: return vec_sld(zeroes128(), a, 14); break;
|
m128 result = (m128) vec_sro((uint8x16_t) a, (uint8x16_t) sl);
|
||||||
case 3: return vec_sld(zeroes128(), a, 13); break;
|
return result;
|
||||||
case 4: return vec_sld(zeroes128(), a, 12); break;
|
|
||||||
case 5: return vec_sld(zeroes128(), a, 11); break;
|
|
||||||
case 6: return vec_sld(zeroes128(), a, 10); break;
|
|
||||||
case 7: return vec_sld(zeroes128(), a, 9); break;
|
|
||||||
case 8: return vec_sld(zeroes128(), a, 8); break;
|
|
||||||
case 9: return vec_sld(zeroes128(), a, 7); break;
|
|
||||||
case 10: return vec_sld(zeroes128(), a, 6); break;
|
|
||||||
case 11: return vec_sld(zeroes128(), a, 5); break;
|
|
||||||
case 12: return vec_sld(zeroes128(), a, 4); break;
|
|
||||||
case 13: return vec_sld(zeroes128(), a, 3); break;
|
|
||||||
case 14: return vec_sld(zeroes128(), a, 2); break;
|
|
||||||
case 15: return vec_sld(zeroes128(), a, 1); break;
|
|
||||||
}
|
|
||||||
return a;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
@ -212,27 +148,13 @@ static really_inline m128 eq64_m128(m128 a, m128 b) {
|
|||||||
return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
|
return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static really_inline u32 movemask128(m128 a) {
|
static really_inline u32 movemask128(m128 a) {
|
||||||
uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7));
|
static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||||
|
uint8x16_t bitmask = vec_gb((uint8x16_t) a);
|
||||||
uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
|
bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
|
||||||
uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
|
u32 movemask;
|
||||||
uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
|
vec_ste((uint32x4_t) bitmask, 0, &movemask);
|
||||||
|
return movemask;
|
||||||
uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14));
|
|
||||||
uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
|
|
||||||
uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
|
|
||||||
|
|
||||||
uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
|
|
||||||
uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
|
|
||||||
uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
|
|
||||||
|
|
||||||
uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9);
|
|
||||||
uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
|
|
||||||
uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
|
|
||||||
|
|
||||||
return s5[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline m128 set1_16x8(u8 c) {
|
static really_inline m128 set1_16x8(u8 c) {
|
||||||
@ -363,7 +285,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
|
|||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break;
|
#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break;
|
||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
@ -392,42 +313,50 @@ m128 palignr_imm(m128 r, m128 l, int offset) {
|
|||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m128 palignr(m128 r, m128 l, int offset) {
|
m128 palignr(m128 r, m128 l, int offset) {
|
||||||
#if defined(HS_OPTIMIZE)
|
if (offset == 0) return l;
|
||||||
// need a faster way to do this.
|
if (offset == 16) return r;
|
||||||
return palignr_imm(r, l, offset);
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
#else
|
if (__builtin_constant_p(offset)) {
|
||||||
return palignr_imm(r, l, offset);
|
return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
m128 sl = (m128) vec_splats((uint8_t) (offset << 3));
|
||||||
|
m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3));
|
||||||
|
m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr);
|
||||||
|
m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl);
|
||||||
|
return or128(lhs, rhs);
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef CASE_ALIGN_VECTORS
|
#undef CASE_ALIGN_VECTORS
|
||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m128 rshiftbyte_m128(m128 a, unsigned b) {
|
m128 rshiftbyte_m128(m128 a, unsigned b) {
|
||||||
return rshift_m128(a,b);
|
return palignr_imm(zeroes128(), a, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m128 lshiftbyte_m128(m128 a, unsigned b) {
|
m128 lshiftbyte_m128(m128 a, unsigned b) {
|
||||||
return lshift_m128(a,b);
|
return palignr_imm(a, zeroes128(), 16 - b);
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
||||||
assert(amount >= -16 && amount <= 16);
|
assert(amount >= -16 && amount <= 16);
|
||||||
if (amount < 0){
|
if (amount < 0) {
|
||||||
return palignr_imm(zeroes128(), in, -amount);
|
return rshiftbyte_m128(in, -amount);
|
||||||
} else{
|
} else {
|
||||||
return palignr_imm(in, zeroes128(), 16 - amount);
|
return lshiftbyte_m128(in, amount);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m128 mask1bit128(unsigned int n) {
|
m128 mask1bit128(unsigned int n) {
|
||||||
assert(n < sizeof(m128) * 8);
|
assert(n < sizeof(m128) * 8);
|
||||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
static uint64x2_t onebit = { 1, 0 };
|
||||||
mask_idx -= n / 8;
|
m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3));
|
||||||
return loadu128(&simd_onebit_masks[mask_idx]);
|
m128 bits = (m128) vec_splats((uint8_t) ((n % 8)));
|
||||||
|
m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets);
|
||||||
|
return (m128) vec_sll((uint8x16_t) mask, (uint8x16_t) bits);
|
||||||
}
|
}
|
||||||
|
|
||||||
// switches on bit N in the given vector.
|
// switches on bit N in the given vector.
|
||||||
|
@ -165,8 +165,67 @@ m128 load_m128_from_u64a(const u64a *p) {
|
|||||||
return _mm_set_epi64x(0LL, *p);
|
return _mm_set_epi64x(0LL, *p);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
|
#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break;
|
||||||
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
|
|
||||||
|
static really_inline
|
||||||
|
m128 rshiftbyte_m128(const m128 a, int count_immed) {
|
||||||
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
|
if (__builtin_constant_p(count_immed)) {
|
||||||
|
return _mm_srli_si128(a, count_immed);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
switch (count_immed) {
|
||||||
|
case 0: return a; break;
|
||||||
|
CASE_RSHIFT_VECTOR(a, 1);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 2);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 3);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 4);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 5);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 6);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 7);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 8);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 9);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 10);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 11);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 12);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 13);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 14);
|
||||||
|
CASE_RSHIFT_VECTOR(a, 15);
|
||||||
|
default: return zeroes128(); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#undef CASE_RSHIFT_VECTOR
|
||||||
|
|
||||||
|
#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break;
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
m128 lshiftbyte_m128(const m128 a, int count_immed) {
|
||||||
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
|
if (__builtin_constant_p(count_immed)) {
|
||||||
|
return _mm_slli_si128(a, count_immed);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
switch (count_immed) {
|
||||||
|
case 0: return a; break;
|
||||||
|
CASE_LSHIFT_VECTOR(a, 1);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 2);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 3);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 4);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 5);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 6);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 7);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 8);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 9);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 10);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 11);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 12);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 13);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 14);
|
||||||
|
CASE_LSHIFT_VECTOR(a, 15);
|
||||||
|
default: return zeroes128(); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#undef CASE_LSHIFT_VECTOR
|
||||||
|
|
||||||
#if defined(HAVE_SSE41)
|
#if defined(HAVE_SSE41)
|
||||||
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
|
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
|
||||||
@ -255,14 +314,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
|
|||||||
memcpy(&a, ptr, n);
|
memcpy(&a, ptr, n);
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
extern const u8 simd_onebit_masks[];
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif*/
|
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m128 mask1bit128(unsigned int n) {
|
m128 mask1bit128(unsigned int n) {
|
||||||
@ -330,6 +381,7 @@ m128 palignr_sw(m128 r, m128 l, int offset) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#undef CASE_ALIGN_VECTORS
|
||||||
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m128 palignr(m128 r, m128 l, int offset) {
|
m128 palignr(m128 r, m128 l, int offset) {
|
||||||
@ -340,7 +392,6 @@ m128 palignr(m128 r, m128 l, int offset) {
|
|||||||
#endif
|
#endif
|
||||||
return palignr_sw(r, l, offset);
|
return palignr_sw(r, l, offset);
|
||||||
}
|
}
|
||||||
#undef CASE_ALIGN_VECTORS
|
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
||||||
|
@ -189,10 +189,7 @@ public:
|
|||||||
size_t sum = 0;
|
size_t sum = 0;
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
for (; i + 4 <= num_blocks; i += 4) {
|
for (; i + 4 <= num_blocks; i += 4) {
|
||||||
sum += popcount64(bits[i]);
|
sum += popcount64x4(&bits[i]);
|
||||||
sum += popcount64(bits[i + 1]);
|
|
||||||
sum += popcount64(bits[i + 2]);
|
|
||||||
sum += popcount64(bits[i + 3]);
|
|
||||||
}
|
}
|
||||||
for (; i < num_blocks; i++) {
|
for (; i < num_blocks; i++) {
|
||||||
sum += popcount64(bits[i]);
|
sum += popcount64(bits[i]);
|
||||||
|
@ -52,6 +52,15 @@ u32 popcount32(u32 x) {
|
|||||||
// #endif
|
// #endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
u32 popcount32x4(u32 const *x) {
|
||||||
|
u32 sum = popcount32(x[0]);
|
||||||
|
sum += popcount32(x[1]);
|
||||||
|
sum += popcount32(x[2]);
|
||||||
|
sum += popcount32(x[3]);
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
u32 popcount64(u64a x) {
|
u32 popcount64(u64a x) {
|
||||||
return __builtin_popcountll(x);
|
return __builtin_popcountll(x);
|
||||||
@ -73,5 +82,14 @@ u32 popcount64(u64a x) {
|
|||||||
// #endif
|
// #endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static really_inline
|
||||||
|
u32 popcount64x4(u64a const *x) {
|
||||||
|
volatile u32 sum = popcount64(x[0]);
|
||||||
|
sum += popcount64(x[1]);
|
||||||
|
sum += popcount64(x[2]);
|
||||||
|
sum += popcount64(x[3]);
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* UTIL_POPCOUNT_H_ */
|
#endif /* UTIL_POPCOUNT_H_ */
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@
|
|||||||
#include "util/supervector/supervector.hpp"
|
#include "util/supervector/supervector.hpp"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
// 128-bit Powerpc64le implementation
|
// 128-bit IBM Power VSX implementation
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(SuperVector const &other)
|
really_inline SuperVector<16>::SuperVector(SuperVector const &other)
|
||||||
@ -47,6 +47,69 @@ really_inline SuperVector<16>::SuperVector(SuperVector const &other)
|
|||||||
u.v128[0] = other.u.v128[0];
|
u.v128[0] = other.u.v128[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(char __bool __vector v)
|
||||||
|
{
|
||||||
|
u.u8x16[0] = (uint8x16_t) v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(int8x16_t const v)
|
||||||
|
{
|
||||||
|
u.s8x16[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(uint8x16_t const v)
|
||||||
|
{
|
||||||
|
u.u8x16[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(int16x8_t const v)
|
||||||
|
{
|
||||||
|
u.s16x8[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(uint16x8_t const v)
|
||||||
|
{
|
||||||
|
u.u16x8[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(int32x4_t const v)
|
||||||
|
{
|
||||||
|
u.s32x4[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(uint32x4_t const v)
|
||||||
|
{
|
||||||
|
u.u32x4[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(int64x2_t const v)
|
||||||
|
{
|
||||||
|
u.s64x2[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
template<>
|
||||||
|
really_inline SuperVector<16>::SuperVector(uint64x2_t const v)
|
||||||
|
{
|
||||||
|
u.u64x2[0] = v;
|
||||||
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
|
really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
|
||||||
{
|
{
|
||||||
@ -57,69 +120,69 @@ template<>
|
|||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(int8_t const other)
|
really_inline SuperVector<16>::SuperVector(int8_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(other);
|
u.s8x16[0] = vec_splats(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(uint8_t const other)
|
really_inline SuperVector<16>::SuperVector(uint8_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(static_cast<uint8_t>(other));
|
u.u8x16[0] = vec_splats(static_cast<uint8_t>(other));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(int16_t const other)
|
really_inline SuperVector<16>::SuperVector(int16_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(other);
|
u.s16x8[0] = vec_splats(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(uint16_t const other)
|
really_inline SuperVector<16>::SuperVector(uint16_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(static_cast<uint16_t>(other));
|
u.u16x8[0] = vec_splats(static_cast<uint16_t>(other));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(int32_t const other)
|
really_inline SuperVector<16>::SuperVector(int32_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(other);
|
u.s32x4[0] = vec_splats(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(uint32_t const other)
|
really_inline SuperVector<16>::SuperVector(uint32_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(static_cast<uint32_t>(other));
|
u.u32x4[0] = vec_splats(static_cast<uint32_t>(other));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(int64_t const other)
|
really_inline SuperVector<16>::SuperVector(int64_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
|
u.s64x2[0] = (int64x2_t) vec_splats(static_cast<ulong64_t>(other));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16>::SuperVector(uint64_t const other)
|
really_inline SuperVector<16>::SuperVector(uint64_t const other)
|
||||||
{
|
{
|
||||||
u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
|
u.u64x2[0] = (uint64x2_t) vec_splats(static_cast<ulong64_t>(other));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Constants
|
// Constants
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16> SuperVector<16>::Ones(void)
|
really_inline SuperVector<16> SuperVector<16>::Ones(void)
|
||||||
{
|
{
|
||||||
return {(m128) vec_splat_s8(-1)};
|
return { vec_splat_s8(-1)};
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
|
really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
|
||||||
{
|
{
|
||||||
return {(m128) vec_splat_s8(0)};
|
return { vec_splat_s8(0) };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Methods
|
// Methods
|
||||||
@ -133,39 +196,38 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
|
|||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {vec_and(u.v128[0], b.u.v128[0])};
|
return { vec_and(u.v128[0], b.u.v128[0]) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {vec_or(u.v128[0], b.u.v128[0])};
|
return { vec_or(u.v128[0], b.u.v128[0]) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {(m128) vec_xor(u.v128[0], b.u.v128[0])};
|
return { vec_xor(u.v128[0], b.u.v128[0]) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator!() const
|
really_inline SuperVector<16> SuperVector<16>::operator!() const
|
||||||
{
|
{
|
||||||
return {(m128) vec_xor(u.v128[0], u.v128[0])};
|
return { vec_xor(u.v128[0], u.v128[0]) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
m128 not_res = vec_xor(u.v128[0], (m128)vec_splat_s8(-1));
|
int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1));
|
||||||
return {(m128) vec_and(not_res, (m128)b.u.v128[0]) };
|
return { vec_and(not_res, b.u.s8x16[0]) };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
|
return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -177,28 +239,27 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const
|
|||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])};
|
return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])};
|
return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])};
|
return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])};
|
return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
|
really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
|
||||||
{
|
{
|
||||||
@ -208,25 +269,12 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
|
|||||||
template <>
|
template <>
|
||||||
really_inline typename SuperVector<16>::comparemask_type
|
really_inline typename SuperVector<16>::comparemask_type
|
||||||
SuperVector<16>::comparemask(void) const {
|
SuperVector<16>::comparemask(void) const {
|
||||||
uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
|
uint8x16_t bitmask = vec_gb( u.u8x16[0]);
|
||||||
|
static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||||
uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
|
bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
|
||||||
uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
|
u32 movemask;
|
||||||
uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
|
vec_ste((uint32x4_t) bitmask, 0, &movemask);
|
||||||
|
return movemask;
|
||||||
uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14));
|
|
||||||
uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
|
|
||||||
uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
|
|
||||||
|
|
||||||
uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
|
|
||||||
uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
|
|
||||||
uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
|
|
||||||
|
|
||||||
uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9);
|
|
||||||
uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
|
|
||||||
uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
|
|
||||||
|
|
||||||
return s5[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -248,35 +296,35 @@ template <>
|
|||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) };
|
return { vec_sl(u.s8x16[0], vec_splat_u8(N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) };
|
return { vec_sl(u.s16x8[0], vec_splat_u16(N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) };
|
return { vec_sl(u.s32x4[0], vec_splat_u32(N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) };
|
return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)};
|
return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -290,35 +338,35 @@ template <>
|
|||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) };
|
return { vec_sr(u.s8x16[0], vec_splat_u8(N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) };
|
return { vec_sr(u.s16x8[0], vec_splat_u16(N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) };
|
return { vec_sr(u.s32x4[0], vec_splat_u32(N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) };
|
return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
template<uint8_t N>
|
template<uint8_t N>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
|
really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
|
||||||
{
|
{
|
||||||
return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) };
|
return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -348,50 +396,40 @@ template <>
|
|||||||
really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint8x16_t shift_indices = vec_splats((uint8_t) N);
|
||||||
SuperVector result;
|
return { vec_sl(u.u8x16[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const
|
really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint16x8_t shift_indices = vec_splats((uint16_t) N);
|
||||||
SuperVector result;
|
return { vec_sl(u.u16x8[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint32x4_t shift_indices = vec_splats((uint32_t) N);
|
||||||
SuperVector result;
|
return { vec_sl(u.u32x4[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint64x2_t shift_indices = vec_splats((ulong64_t) N);
|
||||||
SuperVector result;
|
return { vec_sl(u.u64x2[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
SuperVector sl{N << 3};
|
||||||
SuperVector result;
|
return { vec_slo(u.u8x16[0], sl.u.u8x16[0]) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -404,50 +442,40 @@ template <>
|
|||||||
really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint8x16_t shift_indices = vec_splats((uint8_t) N);
|
||||||
SuperVector result;
|
return { vec_sr(u.u8x16[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint16x8_t shift_indices = vec_splats((uint16_t) N);
|
||||||
SuperVector result;
|
return { vec_sr(u.u16x8[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint32x4_t shift_indices = vec_splats((uint32_t) N);
|
||||||
SuperVector result;
|
return { vec_sr(u.u32x4[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
uint64x2_t shift_indices = vec_splats((ulong64_t) N);
|
||||||
SuperVector result;
|
return { vec_sr(u.u64x2[0], shift_indices) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const
|
really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
|
||||||
{
|
{
|
||||||
if (N == 0) return *this;
|
if (N == 0) return *this;
|
||||||
if (N == 16) return Zeroes();
|
SuperVector sr{N << 3};
|
||||||
SuperVector result;
|
return { vec_sro(u.u8x16[0], sr.u.u8x16[0]) };
|
||||||
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; });
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -459,51 +487,25 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
|
|||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
||||||
{
|
{
|
||||||
switch(N) {
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 15)}; break;
|
if (N == 0) return *this;
|
||||||
case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 14)}; break;
|
if (__builtin_constant_p(N)) {
|
||||||
case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 13)}; break;
|
return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) };
|
||||||
case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 12)}; break;
|
|
||||||
case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 11)}; break;
|
|
||||||
case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 10)}; break;
|
|
||||||
case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 9)}; break;
|
|
||||||
case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 8)}; break;
|
|
||||||
case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 7)}; break;
|
|
||||||
case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break;
|
|
||||||
case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break;
|
|
||||||
case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break;
|
|
||||||
case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break;
|
|
||||||
case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break;
|
|
||||||
case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break;
|
|
||||||
case 16: return Zeroes(); break;
|
|
||||||
default: break;
|
|
||||||
}
|
}
|
||||||
return *this;
|
#endif
|
||||||
|
return vshr_128(N);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||||
{
|
{
|
||||||
switch(N) {
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break;
|
if (N == 0) return *this;
|
||||||
case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break;
|
if (__builtin_constant_p(N)) {
|
||||||
case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break;
|
return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)};
|
||||||
case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break;
|
|
||||||
case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break;
|
|
||||||
case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break;
|
|
||||||
case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break;
|
|
||||||
case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break;
|
|
||||||
case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break;
|
|
||||||
case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break;
|
|
||||||
case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; break;
|
|
||||||
case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 12)}; break;
|
|
||||||
case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break;
|
|
||||||
case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break;
|
|
||||||
case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 15)}; break;
|
|
||||||
case 16: return Zeroes(); break;
|
|
||||||
default: break;
|
|
||||||
}
|
}
|
||||||
return *this;
|
#endif
|
||||||
|
return vshl_128(N);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@ -521,50 +523,39 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
|
|||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
|
really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
|
||||||
{
|
{
|
||||||
return (m128) vec_xl(0, (const long64_t*)ptr);
|
return { vec_xl(0, (const long64_t*)ptr) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
|
really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
|
||||||
{
|
{
|
||||||
assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
|
assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
|
||||||
return (m128) vec_xl(0, (const long64_t*)ptr);
|
return { vec_xl(0, (const long64_t*)ptr) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
|
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
|
||||||
{
|
{
|
||||||
SuperVector<16> mask = Ones_vshr(16 -len);
|
SuperVector<16> mask = Ones_vshr(16 -len);
|
||||||
mask.print8("mask");
|
|
||||||
SuperVector<16> v = loadu(ptr);
|
SuperVector<16> v = loadu(ptr);
|
||||||
v.print8("v");
|
|
||||||
return mask & v;
|
return mask & v;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
|
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
|
||||||
{
|
{
|
||||||
|
if (offset == 0) return other;
|
||||||
switch(offset) {
|
if (offset == 16) return *this;
|
||||||
case 0: return other; break;
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
case 1: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 15)}; break;
|
if (__builtin_constant_p(offset)) {
|
||||||
case 2: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 14)}; break;
|
return { vec_sld(u.s8x16[0], other.u.s8x16[0], offset) };
|
||||||
case 3: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 13)}; break;
|
|
||||||
case 4: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 12)}; break;
|
|
||||||
case 5: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 11)}; break;
|
|
||||||
case 6: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 10)}; break;
|
|
||||||
case 7: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 9)}; break;
|
|
||||||
case 8: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 8)}; break;
|
|
||||||
case 9: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 7)}; break;
|
|
||||||
case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break;
|
|
||||||
case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break;
|
|
||||||
case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break;
|
|
||||||
case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break;
|
|
||||||
case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break;
|
|
||||||
case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break;
|
|
||||||
default: break;
|
|
||||||
}
|
}
|
||||||
return *this;
|
#endif
|
||||||
|
uint8x16_t sl = vec_splats((uint8_t) (offset << 3));
|
||||||
|
uint8x16_t sr = vec_splats((uint8_t) ((16 - offset) << 3));
|
||||||
|
uint8x16_t rhs = vec_slo(u.u8x16[0], sr);
|
||||||
|
uint8x16_t lhs = vec_sro(other.u.u8x16[0], sl);
|
||||||
|
return { vec_or(lhs, rhs) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@ -574,9 +565,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
|
|||||||
/* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
|
/* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
|
||||||
In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane.
|
In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane.
|
||||||
below is the version that is converted from Intel to PPC. */
|
below is the version that is converted from Intel to PPC. */
|
||||||
uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80));
|
uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80));
|
||||||
uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]);
|
uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]);
|
||||||
return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask);
|
return { vec_sel(res, vec_splat_u8(0), mask) };
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
|
@ -204,7 +204,7 @@ public:
|
|||||||
SuperVector(typename base_type::type const v);
|
SuperVector(typename base_type::type const v);
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
SuperVector(T other);
|
SuperVector(T const other);
|
||||||
|
|
||||||
SuperVector(SuperVector<SIZE/2> const lo, SuperVector<SIZE/2> const hi);
|
SuperVector(SuperVector<SIZE/2> const lo, SuperVector<SIZE/2> const hi);
|
||||||
SuperVector(previous_type const lo, previous_type const hi);
|
SuperVector(previous_type const lo, previous_type const hi);
|
||||||
|
@ -723,10 +723,59 @@ TEST(SimdUtilsTest, set2x128) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define TEST_LSHIFTBYTE128(v1, buf, l) { \
|
||||||
|
m128 v_shifted = lshiftbyte_m128(v1, l); \
|
||||||
|
storeu128(res, v_shifted); \
|
||||||
|
int i; \
|
||||||
|
for (i=0; i < l; i++) { \
|
||||||
|
assert(res[i] == 0); \
|
||||||
|
} \
|
||||||
|
for (; i < 16; i++) { \
|
||||||
|
assert(res[i] == vec[i - l]); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SimdUtilsTest, lshiftbyte128){
|
||||||
|
u8 vec[16];
|
||||||
|
u8 res[16];
|
||||||
|
for (int i=0; i<16; i++) {
|
||||||
|
vec[i]=i;
|
||||||
|
}
|
||||||
|
m128 v1 = loadu128(vec);
|
||||||
|
for (int j = 0; j<16; j++){
|
||||||
|
TEST_LSHIFTBYTE128(v1, vec, j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define TEST_RSHIFTBYTE128(v1, buf, l) { \
|
||||||
|
m128 v_shifted = rshiftbyte_m128(v1, l); \
|
||||||
|
storeu128(res, v_shifted); \
|
||||||
|
int i; \
|
||||||
|
for (i=15; i >= 16 - l; i--) { \
|
||||||
|
assert(res[i] == 0); \
|
||||||
|
} \
|
||||||
|
for (; i >= 0; i--) { \
|
||||||
|
assert(res[i] == vec[i + l]); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SimdUtilsTest, rshiftbyte128){
|
||||||
|
u8 vec[16];
|
||||||
|
u8 res[16];
|
||||||
|
for (int i=0; i<16; i++) {
|
||||||
|
vec[i]=i;
|
||||||
|
}
|
||||||
|
m128 v1 = loadu128(vec);
|
||||||
|
for (int j = 0; j<16; j++){
|
||||||
|
TEST_RSHIFTBYTE128(v1, vec, j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST(SimdUtilsTest, variableByteShift128) {
|
TEST(SimdUtilsTest, variableByteShift128) {
|
||||||
char base[] = "0123456789ABCDEF";
|
char base[] = "0123456789ABCDEF";
|
||||||
m128 in = loadu128(base);
|
m128 in = loadu128(base);
|
||||||
|
|
||||||
|
|
||||||
EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
|
EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
|
||||||
variable_byte_shift_m128(in, 0)));
|
variable_byte_shift_m128(in, 0)));
|
||||||
EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
|
EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
|
||||||
@ -773,7 +822,7 @@ TEST(SimdUtilsTest, variableByteShift128) {
|
|||||||
EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
|
EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
|
||||||
variable_byte_shift_m128(in, 10)));
|
variable_byte_shift_m128(in, 10)));
|
||||||
|
|
||||||
EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16)));
|
EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 15), variable_byte_shift_m128(in, 15)));
|
||||||
EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16)));
|
EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user