MCSHENG64: extend to 64-state based on mcsheng

This commit is contained in:
Zhu,Wenjun
2020-09-08 14:59:33 +00:00
committed by Konstantinos Margaritis
parent dea7c4dc2e
commit d96f1ab505
15 changed files with 2334 additions and 15 deletions

View File

@@ -108,6 +108,12 @@ m128 lshift64_m128(m128 a, unsigned b) {
// Byte-wise equality compare of two 128-bit vectors; thin wrapper over
// _mm_cmpeq_epi8, so the result is a per-byte mask vector.
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
// Collapses a 128-bit vector into the integer mask returned by
// _mm_movemask_epi8, cast to u32 so callers get an unsigned value.
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
#if defined(HAVE_AVX512)
/**
 * Reinterpret a 512-bit vector as a 128-bit vector, keeping only the
 * low 128 bits (wraps _mm512_castsi512_si128).
 */
static really_inline
m128 cast512to128(const m512 in) {
    const m128 low_lane = _mm512_castsi512_si128(in);
    return low_lane;
}
#endif
/** Build a 128-bit vector with every byte set to c. */
static really_inline
m128 set1_16x8(u8 c) {
    const m128 splat = _mm_set1_epi8(c);
    return splat;
}
@@ -165,6 +171,10 @@ m128 load_m128_from_u64a(const u64a *p) {
#endif // !AVX2
/**
 * Add two 128-bit vectors as packed 64-bit integers
 * (wraps _mm_add_epi64; there is no carry between the 64-bit lanes).
 */
static really_inline
m128 add128(m128 a, m128 b) {
    const m128 sum = _mm_add_epi64(a, b);
    return sum;
}
/** Bitwise AND of two 128-bit vectors (wraps _mm_and_si128). */
static really_inline
m128 and128(m128 a, m128 b) {
    const m128 masked = _mm_and_si128(a, b);
    return masked;
}
@@ -352,6 +362,10 @@ static really_inline m256 ones256(void) {
return rv;
}
/**
 * Add two 256-bit vectors as packed 64-bit integers
 * (wraps _mm256_add_epi64; there is no carry between the 64-bit lanes).
 */
static really_inline
m256 add256(m256 a, m256 b) {
    const m256 sum = _mm256_add_epi64(a, b);
    return sum;
}
/** Bitwise AND of two 256-bit vectors (wraps _mm256_and_si256). */
static really_inline
m256 and256(m256 a, m256 b) {
    const m256 masked = _mm256_and_si256(a, b);
    return masked;
}
@@ -562,6 +576,12 @@ static really_inline u32 movd512(const m512 in) {
return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
}
/**
 * Extract the lowest 64 bits of a 512-bit vector as a scalar.
 *
 * NOTE: AVX512 does not appear to offer a direct _mm512_cvtsi512_si64(),
 * so the conversion is done in two steps: narrow to the low 128 bits,
 * then move the low 64 bits of that into a general-purpose value.
 */
static really_inline
u64a movq512(const m512 in) {
    const m128 low_lane = _mm512_castsi512_si128(in);
    return _mm_cvtsi128_si64(low_lane);
}
static really_inline
m512 pshufb_m512(m512 a, m512 b) {
return _mm512_shuffle_epi8(a, b);
@@ -606,6 +626,11 @@ m512 set1_8x64(u64a a) {
return _mm512_set1_epi64(a);
}
/** Build a 512-bit vector with every 32-bit element set to a. */
static really_inline m512 set16x32(u32 a) {
    const m512 splat = _mm512_set1_epi32(a);
    return splat;
}
static really_inline
m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
@@ -624,6 +649,31 @@ m512 set1_4x128(m128 a) {
return _mm512_broadcast_i32x4(a);
}
/**
 * Saturating add of packed unsigned bytes in two 512-bit vectors
 * (wraps _mm512_adds_epu8).
 */
static really_inline m512 sadd_u8_m512(m512 a, m512 b) {
    const m512 sat_sum = _mm512_adds_epu8(a, b);
    return sat_sum;
}
/**
 * Per-byte unsigned maximum of two 512-bit vectors
 * (wraps _mm512_max_epu8).
 */
static really_inline m512 max_u8_m512(m512 a, m512 b) {
    const m512 larger = _mm512_max_epu8(a, b);
    return larger;
}
/**
 * Per-byte unsigned minimum of two 512-bit vectors
 * (wraps _mm512_min_epu8).
 */
static really_inline m512 min_u8_m512(m512 a, m512 b) {
    const m512 smaller = _mm512_min_epu8(a, b);
    return smaller;
}
/**
 * Per-byte subtraction a - b over two 512-bit vectors
 * (wraps _mm512_sub_epi8, i.e. non-saturating).
 */
static really_inline m512 sub_u8_m512(m512 a, m512 b) {
    const m512 diff = _mm512_sub_epi8(a, b);
    return diff;
}
/**
 * Add two 512-bit vectors as packed 64-bit integers.
 *
 * Uses _mm512_add_epi64, matching add128/add256 which use the epi64
 * variants at their widths. There is no "_epu64" add intrinsic: integer
 * addition is identical for signed and unsigned operands, so only the
 * epi64 form exists.
 */
static really_inline
m512 add512(m512 a, m512 b) {
    return _mm512_add_epi64(a, b);
}
static really_inline
m512 and512(m512 a, m512 b) {
return _mm512_and_si512(a, b);