diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index 356cc3e6..715ab684 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -147,74 +147,43 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
                        const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
     /* +1: the zones ensure that we can read the byte at z->end */
     assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+    u64a domain_mask = ~domain_mask_flipped;
-    u64a ALIGN_ATTR(16) ptr[16];
-    ptr[0] = unaligned_load_u32(itPtr + 0);
-    ptr[1] = unaligned_load_u32(itPtr + 1);
-    ptr[2] = unaligned_load_u32(itPtr + 2);
-    ptr[3] = unaligned_load_u32(itPtr + 3);
-    ptr[4] = unaligned_load_u32(itPtr + 4);
-    ptr[5] = unaligned_load_u32(itPtr + 5);
-    ptr[6] = unaligned_load_u32(itPtr + 6);
-    ptr[7] = unaligned_load_u32(itPtr + 7);
-    ptr[8] = unaligned_load_u32(itPtr + 8);
-    ptr[9] = unaligned_load_u32(itPtr + 9);
-    ptr[10] = unaligned_load_u32(itPtr + 10);
-    ptr[11] = unaligned_load_u32(itPtr + 11);
-    ptr[12] = unaligned_load_u32(itPtr + 12);
-    ptr[13] = unaligned_load_u32(itPtr + 13);
-    ptr[14] = unaligned_load_u32(itPtr + 14);
-    ptr[15] = unaligned_load_u32(itPtr + 15);
+    u64a it_hi = *(const u64a *)itPtr;
+    u64a it_lo = *(const u64a *)(itPtr + 8);
+    u64a reach0 = domain_mask & it_hi;
+    u64a reach1 = domain_mask & (it_hi >> 8);
+    u64a reach2 = domain_mask & (it_hi >> 16);
+    u64a reach3 = domain_mask & (it_hi >> 24);
+    u64a reach4 = domain_mask & (it_hi >> 32);
+    u64a reach5 = domain_mask & (it_hi >> 40);
+    u64a reach6 = domain_mask & (it_hi >> 48);
+    u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
+    u64a reach8 = domain_mask & it_lo;
+    u64a reach9 = domain_mask & (it_lo >> 8);
+    u64a reach10 = domain_mask & (it_lo >> 16);
+    u64a reach11 = domain_mask & (it_lo >> 24);
+    u64a reach12 = domain_mask & (it_lo >> 32);
+    u64a reach13 = domain_mask & (it_lo >> 40);
+    u64a reach14 = domain_mask & (it_lo >> 48);
+    u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
-    u64a mask_not = ~domain_mask_flipped;
-    u64a reach0 = mask_not & ptr[0];
-    u64a reach1 = mask_not & ptr[1];
-    u64a reach2 = mask_not & ptr[2];
-    u64a reach3 = mask_not & ptr[3];
-    u64a reach4 = mask_not & ptr[4];
-    u64a reach5 = mask_not & ptr[5];
-    u64a reach6 = mask_not & ptr[6];
-    u64a reach7 = mask_not & ptr[7];
-    u64a reach8 = mask_not & ptr[8];
-    u64a reach9 = mask_not & ptr[9];
-    u64a reach10 = mask_not & ptr[10];
-    u64a reach11 = mask_not & ptr[11];
-    u64a reach12 = mask_not & ptr[12];
-    u64a reach13 = mask_not & ptr[13];
-    u64a reach14 = mask_not & ptr[14];
-    u64a reach15 = mask_not & ptr[15];
-
-    m128 st0 = load_m128_from_u64a(ft + reach0);
-    m128 st1 = load_m128_from_u64a(ft + reach1);
-    m128 st2 = load_m128_from_u64a(ft + reach2);
-    m128 st3 = load_m128_from_u64a(ft + reach3);
-    m128 st4 = load_m128_from_u64a(ft + reach4);
-    m128 st5 = load_m128_from_u64a(ft + reach5);
-    m128 st6 = load_m128_from_u64a(ft + reach6);
-    m128 st7 = load_m128_from_u64a(ft + reach7);
+    m128 st0 = load_m128_from_u64a(ft + reach0);
+    m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
+    m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2);
+    m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3);
+    m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4);
+    m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5);
+    m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6);
+    m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7);
     m128 st8 = load_m128_from_u64a(ft + reach8);
-    m128 st9 = load_m128_from_u64a(ft + reach9);
-    m128 st10 = load_m128_from_u64a(ft + reach10);
-    m128 st11 = load_m128_from_u64a(ft + reach11);
-    m128 st12 = load_m128_from_u64a(ft + reach12);
-    m128 st13 = load_m128_from_u64a(ft + reach13);
-    m128 st14 = load_m128_from_u64a(ft + reach14);
-    m128 st15 = load_m128_from_u64a(ft + reach15);
-
-    st1 = lshiftbyte_m128(st1, 1);
-    st2 = lshiftbyte_m128(st2, 2);
-    st3 = lshiftbyte_m128(st3, 3);
-    st4 = lshiftbyte_m128(st4, 4);
-    st5 = lshiftbyte_m128(st5, 5);
-    st6 = lshiftbyte_m128(st6, 6);
-    st7 = lshiftbyte_m128(st7, 7);
-    st9 = lshiftbyte_m128(st9, 1);
-    st10 = lshiftbyte_m128(st10, 2);
-    st11 = lshiftbyte_m128(st11, 3);
-    st12 = lshiftbyte_m128(st12, 4);
-    st13 = lshiftbyte_m128(st13, 5);
-    st14 = lshiftbyte_m128(st14, 6);
-    st15 = lshiftbyte_m128(st15, 7);
+    m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1);
+    m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2);
+    m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3);
+    m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4);
+    m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5);
+    m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6);
+    m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7);
 
     st0 = or128(st0, st1);
     st2 = or128(st2, st3);
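
For reference, a minimal standalone sketch (not Hyperscan code; load_u64/load_u32 are hypothetical memcpy-based stand-ins for its unaligned-load helpers) of why two 64-bit loads plus byte shifts yield the same masked reach values as the sixteen unaligned 32-bit loads they replace, assuming a little-endian target and a domain mask no wider than 16 bits, which is what the patch relies on:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t u64a;
typedef uint8_t u8;

/* Hypothetical stand-ins for unaligned load helpers. */
static u64a load_u64(const u8 *p) {
    u64a v;
    memcpy(&v, p, sizeof(v));
    return v;
}

static u64a load_u32(const u8 *p) {
    uint32_t v;
    memcpy(&v, p, sizeof(v));
    return v;
}

int main(void) {
    /* 16-byte block plus slack so the 32-bit tail loads stay in bounds
     * (the real code has the zone guarantee for this). */
    u8 buf[20];
    for (int i = 0; i < 20; i++) {
        buf[i] = (u8)(0x11 * i + 3);
    }

    /* Example domain mask: at most the low 16 bits set. */
    u64a domain_mask = 0xffff;

    u64a it_hi = load_u64(buf);      /* bytes 0..7, little-endian */
    u64a it_lo = load_u64(buf + 8);  /* bytes 8..15 */

    for (int i = 0; i < 16; i++) {
        u64a new_reach;
        if (i < 7) {
            new_reach = domain_mask & (it_hi >> (8 * i));
        } else if (i == 7) {
            /* Byte 7 comes from the high word, bytes 8+ from the low word. */
            new_reach = domain_mask & ((it_hi >> 56) | (it_lo << 8));
        } else if (i < 15) {
            new_reach = domain_mask & (it_lo >> (8 * (i - 8)));
        } else {
            /* Offset 15 spills past the two 64-bit words; fall back to a
             * 32-bit unaligned load, as the patch does. */
            new_reach = domain_mask & load_u32(buf + 15);
        }

        /* Old formulation: one unaligned 32-bit load per byte position. */
        u64a old_reach = domain_mask & load_u32(buf + i);

        assert(new_reach == old_reach);
    }

    printf("all 16 reach values match\n");
    return 0;
}

The equivalence holds because each shifted window still contains every byte the mask can select; positions 6 and 14 only expose 16 valid bits, which is exactly why the trick needs the domain mask to fit in 16 bits.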