From 88a18dcf980e9c8496041a6010fc25f6548188ae Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Tue, 28 Apr 2020 10:15:40 +0000 Subject: [PATCH] add AVX512 support for vermicelli model --- src/nfa/vermicelli.h | 179 ++++++++++---- src/nfa/vermicelli_sse.h | 498 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 634 insertions(+), 43 deletions(-) diff --git a/src/nfa/vermicelli.h b/src/nfa/vermicelli.h index 817e681a..ed797d83 100644 --- a/src/nfa/vermicelli.h +++ b/src/nfa/vermicelli.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,20 @@ const u8 *vermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { /* short input, at most VERM_BOUNDARY bytes: one vermMini or vermMiniNocase call covers the whole region */ + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 0) + : vermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf_end; /* nothing found: this forward scan reports buf_end */ + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (; buf < buf_end; buf++) { char cur = (char)*buf; @@ -59,8 +72,8 @@ const u8 *vermicelliExec(char c, char nocase, const u8 *buf, } return buf; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an @@ -99,7 +112,20 @@ const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
vermMiniNocase(chars, buf, buf_end, 1) + : vermMini(chars, buf, buf_end, 1); /* negate is 1 here: the helper inverts its comparison mask */ + if (ptr) { + return ptr; + } + return buf_end; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (; buf < buf_end; buf++) { char cur = (char)*buf; @@ -112,8 +138,8 @@ const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, } return buf; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an @@ -149,12 +175,32 @@ const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? dvermMiniNocase(chars1, chars2, buf, buf_end) + : dvermMini(chars1, chars2, buf, buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; /* CASE_CLEAR is applied to the last byte only when nocase is set */ + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf forward to the next aligned address. 
@@ -205,14 +251,32 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); VERM_TYPE chars2 = VERM_SET_FN(c2); VERM_TYPE mask1 = VERM_SET_FN(m1); VERM_TYPE mask2 = VERM_SET_FN(m2); +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { /* short input is handled here, ahead of the minimum-length assert further down */ + const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf, + buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf forward to the next aligned address. @@ -244,6 +308,7 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, /* check for partial match at end */ if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); return buf_end - 1; } @@ -259,7 +324,20 @@ const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
rvermMiniNocase(chars, buf, buf_end, 0) + : rvermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf - 1; /* nothing found: this reverse scan reports the address just before buf */ + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (buf_end--; buf_end >= buf; buf_end--) { char cur = (char)*buf_end; @@ -272,26 +350,22 @@ const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, } return buf_end; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf backward to the next aligned address. // There's some small overlap here, but we don't mind scanning it twice // if we can do it quickly, do we? - if (nocase) { - const u8 *ptr = - rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0); - if (ptr) { - return ptr; - } - } else { - const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 0); - if (ptr) { - return ptr; - } + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 0) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 0); + + if (ptr) { + return ptr; } buf_end -= min; @@ -322,7 +396,20 @@ const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
rvermMiniNocase(chars, buf, buf_end, 1) + : rvermMini(chars, buf, buf_end, 1); /* negate is 1 here: the helper inverts its comparison mask */ + if (ptr) { + return ptr; + } + return buf - 1; + } +#else if (buf_end - buf < VERM_BOUNDARY) { for (buf_end--; buf_end >= buf; buf_end--) { char cur = (char)*buf_end; @@ -335,26 +422,22 @@ const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, } return buf_end; } +#endif - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { // Input isn't aligned, so we need to run one iteration with an // unaligned load, then skip buf backward to the next aligned address. // There's some small overlap here, but we don't mind scanning it twice // if we can do it quickly, do we? - if (nocase) { - const u8 *ptr = - rvermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1); - if (ptr) { - return ptr; - } - } else { - const u8 *ptr = rvermUnalign(chars, buf_end - VERM_BOUNDARY, 1); - if (ptr) { - return ptr; - } + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 1) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 1); + + if (ptr) { + return ptr; } buf_end -= min; @@ -383,24 +466,36 @@ const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); assert(buf < buf_end); - assert((buf_end - buf) >= VERM_BOUNDARY); - size_t min = (size_t)buf_end % VERM_BOUNDARY; VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
rdvermMiniNocase(chars1, chars2, buf, buf_end) + : rdvermMini(chars1, chars2, buf, buf_end); + + if (ptr) { + return ptr; + } + + // NOTE(review): no partial-match check is made on this short path, unlike the forward double scan; confirm that this is intended
+ return buf - 1; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + size_t min = (size_t)buf_end % VERM_BOUNDARY; if (min) { // input not aligned, so we need to run one iteration with an unaligned // load, then skip buf forward to the next aligned address. There's // some small overlap here, but we don't mind scanning it twice if we // can do it quickly, do we? - const u8 *ptr; - if (nocase) { - ptr = rdvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY); - } else { - ptr = rdvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); - } + const u8 *ptr = nocase ? rdvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : rdvermPrecondition(chars1, chars2, + buf_end - VERM_BOUNDARY); if (ptr) { return ptr; diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 0749470f..3307486c 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,6 +32,8 @@ * (users should include vermicelli.h) */ +#if !defined(HAVE_AVX512) + #define VERM_BOUNDARY 16 #define VERM_TYPE m128 #define VERM_SET_FN set16x8 @@ -391,3 +393,497 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { return NULL; } + +#else // HAVE_AVX512 + +#define VERM_BOUNDARY 64 /* bytes per m512 vector step */ +#define VERM_TYPE m512 +#define VERM_SET_FN set64x8 + +static really_inline +const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { /* Short forward scan; the caller must guarantee that buf_end minus buf is between 1 and 64. 
Assuming eq512mask() yields one bit per equal byte lane and ctz64() the index of the lowest set bit (TODO confirm against their definitions), the result is the first in-range position equal to chars, or the first one that differs when negate is set, and NULL when there is none. */ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); /* low len bits set; a len of 0 or above 64 would make this shift count undefined */ + m512 data = loadu_maskz_m512(mask, buf); /* presumably loads only the lanes selected by mask and zero-fills the rest; confirm against loadu_maskz_m512 */ + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; /* keep only lanes below len; redundant on the negate path, which has already masked z */ + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermMiniNocase(m512 chars, 
const u8 *buf, const u8 *buf_end, + char negate) { /* Same contract as vermMini, except that every loaded byte is ANDed with CASE_CLEAR before it is compared. */ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { /* Forward scan over whole 64-byte blocks; buf must be 64-byte aligned (asserted). The loop stops once fewer than 64 bytes remain, and NULL is returned if no block produced a hit. */ + assert((size_t)buf % 64 == 0); + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; /* the AND with ~0ULL keeps all 64 lanes, so this is a plain inversion */ + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +static really_inline +const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { /* Same loop as vermSearchAligned, with CASE_CLEAR ANDed into each block before the comparison. */ + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) { /* Examines a single 64-byte vector read from buf with loadu512; no alignment is asserted. 
*/ + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) { /* As vermUnalign, with CASE_CLEAR ANDed into the data first. */ + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) 
{ /* Short forward scan for a two-byte sequence; buf_end minus buf must be between 1 and 64. Returns the position of the first byte of the earliest pair found, or NULL. */ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); /* assuming one mask bit per lane: bit i is set when lane i matched chars1 and lane i+1 matched chars2 */ + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { /* Caseless form of dvermMini: the data is ANDed with CASE_CLEAR before both comparisons. */ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2, + const u8 *buf, const u8 *buf_end) { /* Form of dvermMini in which the data is ANDed with mask1 for the chars1 comparison and with mask2 for the chars2 comparison. */ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { /* Forward pair scan over 64-byte blocks read with load512. 
NOTE(review): the Nocase and Masked siblings assert that buf is 64-byte aligned; this one does not. */ + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + if (buf[63] == c1 && buf[64] == c2) { /* a pair split across this block and the next one cannot show up in z (bit 63 of the shifted mask is always clear), so it is tested with scalar reads */ + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { /* Caseless form of dvermSearchAligned; the block-boundary pair is likewise compared after ANDing with CASE_CLEAR. */ + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v = and512(casemask, 
data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, const u8 *buf, const u8 *buf_end) { /* Masked form of the aligned pair scan: mask1 and m1 apply to the first byte, mask2 and m2 to the second. */ + assert((size_t)buf % 64 == 0); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { /* Pair check on one unaligned 64-byte vector; a pair starting in the last lane is deliberately not looked for here (see the comment below). 
*/ + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + 
m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u64a z) { /* Converts the highest set bit of z into an address, with bit 63 corresponding to buf_end[-1]; z must be non-zero (asserted). Assumes clz64() counts leading zero bits; TODO confirm. */ + assert(z); + return buf_end - 64 + 63 - clz64(z); +} + +static really_inline +const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { /* Short reverse scan (buf_end minus buf between 1 and 64): returns the last in-range position that matches, or differs when negate is set, and NULL when there is none. */ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); /* the data was loaded from buf, so bit i of z belongs to buf[i]; hence the end passed in is 64 bytes past buf rather than buf_end */ + } + return NULL; +} + +static really_inline +const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { /* Caseless form of rvermMini (data ANDed with CASE_CLEAR). 
*/ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { /* Reverse scan over whole 64-byte blocks, walking buf_end downwards; buf_end must be 64-byte aligned (asserted). */ + assert((size_t)buf_end % 64 == 0); + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = 
eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) { /* Examines one 64-byte vector read from buf with loadu512 and reports the last hit in it. */ + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { /* Short reverse scan for a two-byte sequence (buf_end minus buf between 1 and 64). */ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); /* assuming one mask bit per lane: bit i is set when lane i matched chars2 and lane i-1 matched chars1, so a hit is reported at the second byte of the pair */ + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { /* Caseless form of rdvermMini (data ANDed with CASE_CLEAR). 
*/ + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { /* Reverse pair scan over 64-byte blocks ending at a 64-byte-aligned buf_end. Unlike the other helpers it returns the final buf_end, not NULL, when nothing is found. */ + assert((size_t)buf_end % 64 == 0); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = 
eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + if (buf_end[-65] == c1 && buf_end[-64] == c2) { /* pair split across this block and the one before it: bit 0 of the shifted mask is always clear, so it is tested with scalar reads */ + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; /* no hit: hands back how far down the scan got */ +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { /* Caseless form of rdvermSearchAligned; it also returns the final buf_end when nothing is found. */ + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + if ((buf_end[-65] & CASE_CLEAR) == c1 + && (buf_end[-64] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { /* Pair check on one unaligned 64-byte vector; a hit is reported at the second byte of the last pair found. 
*/ + m512 data = loadu512(buf); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + // due to laziness, nonalphas and nocase having interesting behaviour + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +#endif // HAVE_AVX512