Add SVE2 support for dvermicelli

Change-Id: I056ef15e162ab6fb1f78964321ce893f4096367e
This commit is contained in:
George Wort
2021-06-23 14:14:28 +01:00
committed by Konstantinos Margaritis
parent 3296d538ea
commit b6a7ee7e84
6 changed files with 532 additions and 317 deletions

View File

@@ -48,82 +48,6 @@
#include "vermicelli_sse.h"
#endif
/*
 * Forward double-character scan over [buf, buf_end) for c1 followed by c2.
 *
 * Returns the first non-NULL pointer reported by one of the dverm* helpers.
 * If none of them reports a hit, the last byte of the buffer is compared
 * against c1 (after applying CASE_CLEAR when nocase is set): on equality
 * buf_end - 1 is returned as a partial match at the end, otherwise buf_end.
 *
 * c1 and c2 are expected to be upper-cased already when nocase is set.
 * Outside of the HAVE_AVX512 short-buffer path the buffer must hold at least
 * VERM_BOUNDARY bytes (asserted).
 */
static really_inline
const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
                               const u8 *buf_end) {
    DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
                 nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
    assert(buf < buf_end);

    VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
    VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */

    /* Mask applied to the final byte when testing for a trailing c1. */
    const u8 last_byte_mask = nocase ? CASE_CLEAR : 0xff;
    const u8 *hit;

#ifdef HAVE_AVX512
    /* Short buffers are handled entirely by the "mini" helpers. */
    if (buf_end - buf <= VERM_BOUNDARY) {
        if (nocase) {
            hit = dvermMiniNocase(chars1, chars2, buf, buf_end);
        } else {
            hit = dvermMini(chars1, chars2, buf, buf_end);
        }
        if (hit) {
            return hit;
        }

        /* No full pair: report a trailing c1 as a partial match. */
        if ((buf_end[-1] & last_byte_mask) == (u8)c1) {
            DEBUG_PRINTF("partial!!!\n");
            return buf_end - 1;
        }
        return buf_end;
    }
#endif

    assert((buf_end - buf) >= VERM_BOUNDARY);

    uintptr_t misalign = (uintptr_t)buf % VERM_BOUNDARY;
    if (misalign != 0) {
        /*
         * Unaligned start: run a single precondition pass at buf, then
         * advance buf to the next VERM_BOUNDARY-aligned address. The bytes
         * between that address and buf + VERM_BOUNDARY get scanned twice,
         * which is accepted as cheap.
         */
        if (nocase) {
            hit = dvermPreconditionNocase(chars1, chars2, buf);
        } else {
            hit = dvermPrecondition(chars1, chars2, buf);
        }
        if (hit) {
            return hit;
        }

        buf += VERM_BOUNDARY - misalign;
        assert(buf < buf_end);
    }

    /* Main loop over the aligned part of the buffer. */
    if (nocase) {
        hit = dvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end);
    } else {
        hit = dvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end);
    }
    if (hit) {
        return hit;
    }

    /* Final pass over the last VERM_BOUNDARY bytes of the buffer. */
    const u8 *tail_block = buf_end - VERM_BOUNDARY;
    if (nocase) {
        hit = dvermPreconditionNocase(chars1, chars2, tail_block);
    } else {
        hit = dvermPrecondition(chars1, chars2, tail_block);
    }
    if (hit) {
        return hit;
    }

    /* No full pair: report a trailing c1 as a partial match. */
    if ((buf_end[-1] & last_byte_mask) == (u8)c1) {
        DEBUG_PRINTF("partial!!!\n");
        return buf_end - 1;
    }
    return buf_end;
}
static really_inline
const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
const u8 *buf, const u8 *buf_end) {
@@ -194,60 +118,4 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
return buf_end;
}
/*
 * Reverse double-character scan over [buf, buf_end) for c1 followed by c2.
 *
 * Returns the highest offset of c2 (NOTE: not c1). Any non-NULL pointer
 * reported by an rdverm* helper is returned directly. In the HAVE_AVX512
 * short-buffer path a miss yields buf - 1; on the general path the result of
 * the aligned search helper is returned unchanged.
 *
 * c1 and c2 are expected to be upper-cased already when nocase is set.
 * Outside of the HAVE_AVX512 short-buffer path the buffer must hold at least
 * VERM_BOUNDARY bytes (asserted).
 */
static really_inline
const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
                                const u8 *buf_end) {
    DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
                 nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
    assert(buf < buf_end);

    VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
    VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */
    const u8 *hit;

#ifdef HAVE_AVX512
    /* Short buffers are handled entirely by the "mini" helpers. */
    if (buf_end - buf <= VERM_BOUNDARY) {
        if (nocase) {
            hit = rdvermMiniNocase(chars1, chars2, buf, buf_end);
        } else {
            hit = rdvermMini(chars1, chars2, buf, buf_end);
        }
        if (hit) {
            return hit;
        }

        /* No partial-match check is performed on this path. */
        return buf - 1;
    }
#endif

    assert((buf_end - buf) >= VERM_BOUNDARY);

    size_t overhang = (size_t)buf_end % VERM_BOUNDARY;
    if (overhang != 0) {
        /*
         * Unaligned end: run a single precondition pass over the last
         * VERM_BOUNDARY bytes, then pull buf_end back to the previous
         * VERM_BOUNDARY-aligned address. The overlap gets scanned twice,
         * which is accepted as cheap.
         */
        const u8 *tail_block = buf_end - VERM_BOUNDARY;
        if (nocase) {
            hit = rdvermPreconditionNocase(chars1, chars2, tail_block);
        } else {
            hit = rdvermPrecondition(chars1, chars2, tail_block);
        }
        if (hit) {
            return hit;
        }

        buf_end -= overhang;
        if (buf >= buf_end) {
            return buf_end;
        }
    }

    /* Main loop over the aligned part of the buffer. */
    return nocase
        ? rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end)
        : rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end);
}
#endif /* VERMICELLI_H */