mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-11-20 02:47:11 +03:00
Borrow cache prefetching tricks from the Marvell port; they seem to improve performance by 5-28%.
This commit is contained in:
@@ -109,7 +109,8 @@ DUMP_MSK(128)
|
||||
#endif
|
||||
|
||||
#define GET_LO_4(chars) and128(chars, low4bits)
|
||||
#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
|
||||
#define GET_HI_4(chars) and128(rshift64_m128(chars, 4), low4bits)
|
||||
//#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
|
||||
|
||||
static really_inline
|
||||
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
|
||||
@@ -177,6 +178,10 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
|
||||
// Reroll FTW.
|
||||
|
||||
const u8 *last_block = buf_end - 16;
|
||||
|
||||
for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) {
|
||||
__builtin_prefetch(itPtr);
|
||||
}
|
||||
while (buf < last_block) {
|
||||
m128 lchars = load128(buf);
|
||||
rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);
|
||||
|
||||
Reference in New Issue
Block a user