lookaround:

add 64x8 and 64x16 shufti models
add mask64 model
expand entry quantity
This commit is contained in:
Hong, Yang A
2020-10-20 20:34:50 +00:00
committed by Konstantinos Margaritis
parent 56cb107005
commit dea7c4dc2e
12 changed files with 892 additions and 46 deletions

View File

@@ -424,6 +424,11 @@ static really_inline m256 loadu256(const void *ptr) {
return _mm256_loadu_si256((const m256 *)ptr);
}
/*
 * Unaligned, zero-masked 256-bit load.
 *
 * Thin wrapper over _mm256_maskz_loadu_epi8: one mask bit in k per byte
 * lane. Per Intel's description of the intrinsic, lanes whose bit is set
 * are read from ptr and lanes whose bit is clear come back as zero.
 *
 * NOTE(review): the intrinsic needs AVX-512BW/VL; the guard that
 * restricts this definition to such builds is not visible here - confirm.
 */
static really_inline
m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
    m256 loaded = _mm256_maskz_loadu_epi8(k, ptr);
    return loaded;
}
// unaligned store
static really_inline void storeu256(void *ptr, m256 a) {
_mm256_storeu_si256((m256 *)ptr, a);
@@ -712,6 +717,22 @@ m512 loadu512(const void *ptr) {
return _mm512_loadu_si512(ptr);
}
/*
 * Unaligned 512-bit store: writes the 64 bytes of a starting at ptr.
 *
 * With HAVE_AVX512 this is a single _mm512_storeu_si512. Otherwise the
 * value is written piecewise from its halves, lowest part first:
 * two 32-byte stores (a.lo, a.hi) with HAVE_AVX2, or four 16-byte
 * stores (a.lo.lo, a.lo.hi, a.hi.lo, a.hi.hi) without it.
 */
static really_inline
void storeu512(void *ptr, m512 a) {
#if defined(HAVE_AVX512)
    _mm512_storeu_si512((m512 *)ptr, a);
#elif defined(HAVE_AVX2)
    char *out = (char *)ptr; /* byte cursor for the offset arithmetic */
    storeu256(out, a.lo);
    storeu256(out + 32, a.hi);
#else
    char *out = (char *)ptr; /* byte cursor for the offset arithmetic */
    storeu128(out, a.lo.lo);
    storeu128(out + 16, a.lo.hi);
    storeu128(out + 32, a.hi.lo);
    storeu128(out + 48, a.hi.hi);
#endif
}
static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
return _mm512_maskz_loadu_epi8(k, ptr);
@@ -722,6 +743,11 @@ m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
return _mm512_mask_loadu_epi8(src, k, ptr);
}
/*
 * Unaligned, masked 512-bit store.
 *
 * Thin wrapper over _mm512_mask_storeu_epi8: one mask bit in k per byte
 * lane. Per Intel's description of the intrinsic, only the bytes of a
 * whose bit is set are written to the matching offset from ptr; memory
 * for lanes whose bit is clear is not written.
 */
static really_inline void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) {
    _mm512_mask_storeu_epi8(ptr, k, a);
}
static really_inline
m512 set_mask_m512(__mmask64 k) {
return _mm512_movm_epi8(k);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -33,7 +33,7 @@
#include "simd_utils.h"
static really_inline
void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) {
void copy_upto_64_bytes(u8 *dst, const u8 *src, unsigned int len) {
switch (len) {
case 0:
break;
@@ -72,14 +72,41 @@ void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) {
case 16:
storeu128(dst, loadu128(src));
break;
case 32:
storeu256(dst, loadu256(src));
break;
default:
assert(len < 32);
case 17:
case 18:
case 19:
case 20:
case 21:
case 22:
case 23:
case 24:
case 25:
case 26:
case 27:
case 28:
case 29:
case 30:
case 31:
storeu128(dst + len - 16, loadu128(src + len - 16));
storeu128(dst, loadu128(src));
break;
case 32:
storeu256(dst, loadu256(src));
break;
#ifdef HAVE_AVX512
case 64:
storebytes512(dst, loadu512(src), 64);
break;
default:
assert(len < 64);
u64a k = (1ULL << len) - 1;
storeu_mask_m512(dst, k, loadu_maskz_m512(k, src));
break;
#else
default:
assert(0);
break;
#endif
}
}