mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
optimize get_conf_stride_1()
This commit is contained in:
parent
f9ef98ce19
commit
c238d627c9
@ -147,74 +147,43 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
||||
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
||||
/* +1: the zones ensure that we can read the byte at z->end */
|
||||
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
||||
u64a domain_mask = ~domain_mask_flipped;
|
||||
|
||||
u64a ALIGN_ATTR(16) ptr[16];
|
||||
ptr[0] = unaligned_load_u32(itPtr + 0);
|
||||
ptr[1] = unaligned_load_u32(itPtr + 1);
|
||||
ptr[2] = unaligned_load_u32(itPtr + 2);
|
||||
ptr[3] = unaligned_load_u32(itPtr + 3);
|
||||
ptr[4] = unaligned_load_u32(itPtr + 4);
|
||||
ptr[5] = unaligned_load_u32(itPtr + 5);
|
||||
ptr[6] = unaligned_load_u32(itPtr + 6);
|
||||
ptr[7] = unaligned_load_u32(itPtr + 7);
|
||||
ptr[8] = unaligned_load_u32(itPtr + 8);
|
||||
ptr[9] = unaligned_load_u32(itPtr + 9);
|
||||
ptr[10] = unaligned_load_u32(itPtr + 10);
|
||||
ptr[11] = unaligned_load_u32(itPtr + 11);
|
||||
ptr[12] = unaligned_load_u32(itPtr + 12);
|
||||
ptr[13] = unaligned_load_u32(itPtr + 13);
|
||||
ptr[14] = unaligned_load_u32(itPtr + 14);
|
||||
ptr[15] = unaligned_load_u32(itPtr + 15);
|
||||
u64a it_hi = *(const u64a *)itPtr;
|
||||
u64a it_lo = *(const u64a *)(itPtr + 8);
|
||||
u64a reach0 = domain_mask & it_hi;
|
||||
u64a reach1 = domain_mask & (it_hi >> 8);
|
||||
u64a reach2 = domain_mask & (it_hi >> 16);
|
||||
u64a reach3 = domain_mask & (it_hi >> 24);
|
||||
u64a reach4 = domain_mask & (it_hi >> 32);
|
||||
u64a reach5 = domain_mask & (it_hi >> 40);
|
||||
u64a reach6 = domain_mask & (it_hi >> 48);
|
||||
u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
|
||||
u64a reach8 = domain_mask & it_lo;
|
||||
u64a reach9 = domain_mask & (it_lo >> 8);
|
||||
u64a reach10 = domain_mask & (it_lo >> 16);
|
||||
u64a reach11 = domain_mask & (it_lo >> 24);
|
||||
u64a reach12 = domain_mask & (it_lo >> 32);
|
||||
u64a reach13 = domain_mask & (it_lo >> 40);
|
||||
u64a reach14 = domain_mask & (it_lo >> 48);
|
||||
u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
|
||||
|
||||
u64a mask_not = ~domain_mask_flipped;
|
||||
u64a reach0 = mask_not & ptr[0];
|
||||
u64a reach1 = mask_not & ptr[1];
|
||||
u64a reach2 = mask_not & ptr[2];
|
||||
u64a reach3 = mask_not & ptr[3];
|
||||
u64a reach4 = mask_not & ptr[4];
|
||||
u64a reach5 = mask_not & ptr[5];
|
||||
u64a reach6 = mask_not & ptr[6];
|
||||
u64a reach7 = mask_not & ptr[7];
|
||||
u64a reach8 = mask_not & ptr[8];
|
||||
u64a reach9 = mask_not & ptr[9];
|
||||
u64a reach10 = mask_not & ptr[10];
|
||||
u64a reach11 = mask_not & ptr[11];
|
||||
u64a reach12 = mask_not & ptr[12];
|
||||
u64a reach13 = mask_not & ptr[13];
|
||||
u64a reach14 = mask_not & ptr[14];
|
||||
u64a reach15 = mask_not & ptr[15];
|
||||
|
||||
m128 st0 = load_m128_from_u64a(ft + reach0);
|
||||
m128 st1 = load_m128_from_u64a(ft + reach1);
|
||||
m128 st2 = load_m128_from_u64a(ft + reach2);
|
||||
m128 st3 = load_m128_from_u64a(ft + reach3);
|
||||
m128 st4 = load_m128_from_u64a(ft + reach4);
|
||||
m128 st5 = load_m128_from_u64a(ft + reach5);
|
||||
m128 st6 = load_m128_from_u64a(ft + reach6);
|
||||
m128 st7 = load_m128_from_u64a(ft + reach7);
|
||||
m128 st0 = load_m128_from_u64a(ft + reach0);
|
||||
m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
|
||||
m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2);
|
||||
m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3);
|
||||
m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4);
|
||||
m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5);
|
||||
m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6);
|
||||
m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7);
|
||||
m128 st8 = load_m128_from_u64a(ft + reach8);
|
||||
m128 st9 = load_m128_from_u64a(ft + reach9);
|
||||
m128 st10 = load_m128_from_u64a(ft + reach10);
|
||||
m128 st11 = load_m128_from_u64a(ft + reach11);
|
||||
m128 st12 = load_m128_from_u64a(ft + reach12);
|
||||
m128 st13 = load_m128_from_u64a(ft + reach13);
|
||||
m128 st14 = load_m128_from_u64a(ft + reach14);
|
||||
m128 st15 = load_m128_from_u64a(ft + reach15);
|
||||
|
||||
st1 = lshiftbyte_m128(st1, 1);
|
||||
st2 = lshiftbyte_m128(st2, 2);
|
||||
st3 = lshiftbyte_m128(st3, 3);
|
||||
st4 = lshiftbyte_m128(st4, 4);
|
||||
st5 = lshiftbyte_m128(st5, 5);
|
||||
st6 = lshiftbyte_m128(st6, 6);
|
||||
st7 = lshiftbyte_m128(st7, 7);
|
||||
st9 = lshiftbyte_m128(st9, 1);
|
||||
st10 = lshiftbyte_m128(st10, 2);
|
||||
st11 = lshiftbyte_m128(st11, 3);
|
||||
st12 = lshiftbyte_m128(st12, 4);
|
||||
st13 = lshiftbyte_m128(st13, 5);
|
||||
st14 = lshiftbyte_m128(st14, 6);
|
||||
st15 = lshiftbyte_m128(st15, 7);
|
||||
m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1);
|
||||
m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2);
|
||||
m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3);
|
||||
m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4);
|
||||
m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5);
|
||||
m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6);
|
||||
m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7);
|
||||
|
||||
st0 = or128(st0, st1);
|
||||
st2 = or128(st2, st3);
|
||||
|
Loading…
x
Reference in New Issue
Block a user