From 9643bb4636145c26d30c37d6e046d2bd3776d7f6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 19 Feb 2024 13:09:02 +0200 Subject: [PATCH] WIP: rework fdr to use fewer instructions, gives about 10% performance increase on SSE/AVX2 --- src/fdr/fdr.c | 126 ++++++++++++++++++++++-------- src/util/arch/arm/simd_utils.h | 10 +++ src/util/arch/common/simd_utils.h | 8 ++ src/util/arch/x86/simd_utils.h | 33 ++++++++ 4 files changed, 143 insertions(+), 34 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d67e2719..62a08e4e 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2024, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -103,6 +104,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } +#include "../print_simd.h" static really_inline void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, @@ -111,41 +113,97 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a it_hi = *(const u64a *)itPtr; - u64a it_lo = *(const u64a *)(itPtr + 8); - u64a reach0 = domain_mask & it_hi; - u64a reach1 = domain_mask & (it_hi >> 8); - u64a reach2 = domain_mask & (it_hi >> 16); - u64a reach3 = domain_mask & (it_hi >> 24); - u64a reach4 = domain_mask & (it_hi >> 32); - u64a reach5 = domain_mask & (it_hi >> 40); - u64a reach6 = domain_mask & (it_hi >> 48); - u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); - u64a reach8 = domain_mask & it_lo; - u64a reach9 = domain_mask & (it_lo >> 8); - u64a reach10 = domain_mask & (it_lo >> 16); - u64a reach11 = domain_mask & (it_lo >> 24); - u64a reach12 = domain_mask & (it_lo >> 32); - u64a reach13 = domain_mask & (it_lo >> 40); - u64a reach14 = domain_mask & (it_lo >> 48); - u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); + // u64a ALIGN_ATTR(16) reach[16]; + u32 ALIGN_ATTR(16) reach[16]; - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1); - m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2); - m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3); - m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4); - m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5); - m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6); - m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7); - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1); - m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2); - m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3); - m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4); - m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5); - m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6); - m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7); + m128 domain_mask_v = set1_4x32(domain_mask); + // m256 ft_v = set1_4x64((ptrdiff_t)ft); + + m128 it_v = loadu128(itPtr); + m128 it_shifted8_v = rshiftbyte_m128(it_v, 1); + m128 it_shifted16_v = rshiftbyte_m128(it_v, 2); + m128 it_shifted24_v = rshiftbyte_m128(it_v, 3); + it_shifted24_v = insert32_m128(it_shifted24_v, unaligned_load_u32(itPtr + 15), 3); + + m128 reach_v[4]; + // m256 reach64_v[4]; + + reach_v[0] = and128(domain_mask_v, it_v); + reach_v[1] = and128(domain_mask_v, it_shifted8_v); + reach_v[2] = and128(domain_mask_v, it_shifted16_v); + reach_v[3] = and128(domain_mask_v, it_shifted24_v); + + // reach_v[0] = lshift32_m128(reach_v[0], 3); + // reach_v[1] = lshift32_m128(reach_v[1], 3); + // reach_v[2] = lshift32_m128(reach_v[2], 3); + // reach_v[3] = lshift32_m128(reach_v[3], 3); + + // reach64_v[0] = widen128(reach_v[0]); + // reach64_v[1] = widen128(reach_v[1]); + // reach64_v[2] = widen128(reach_v[2]); + // reach64_v[3] = widen128(reach_v[3]); + + // reach64_v[0] = add256(reach64_v[0], ft_v); + // reach64_v[1] = add256(reach64_v[1], ft_v); + // reach64_v[2] = add256(reach64_v[2], ft_v); + // reach64_v[3] = add256(reach64_v[3], ft_v); + + // store256(&reach[0], reach64_v[0]); + // store256(&reach[4], reach64_v[1]); + // store256(&reach[8], reach64_v[2]); + // store256(&reach[12], reach64_v[3]); + store128(&reach[0], reach_v[0]); + store128(&reach[4], reach_v[1]); + store128(&reach[8], reach_v[2]); + store128(&reach[12], reach_v[3]); + + m128 st0 = load_m128_from_u64a(ft + reach[0]); + m128 st4 = load_m128_from_u64a(ft + reach[1]); + m128 st8 = load_m128_from_u64a(ft + reach[2]); + m128 st12 = load_m128_from_u64a(ft + reach[3]); + m128 st1 = load_m128_from_u64a(ft + reach[4]); + m128 st5 = load_m128_from_u64a(ft + reach[5]); + m128 st9 = load_m128_from_u64a(ft + reach[6]); + m128 st13 = load_m128_from_u64a(ft + reach[7]); + m128 st2 = load_m128_from_u64a(ft + reach[8]); + m128 st6 = load_m128_from_u64a(ft + reach[9]); + m128 st10 = load_m128_from_u64a(ft + reach[10]); + m128 st14 = load_m128_from_u64a(ft + reach[11]); + m128 st3 = load_m128_from_u64a(ft + reach[12]); + m128 st7 = load_m128_from_u64a(ft + reach[13]); + m128 st11 = load_m128_from_u64a(ft + reach[14]); + m128 st15 = load_m128_from_u64a(ft + reach[15]); + // m128 st0 = load_m128_from_u64a((u64a *)reach[0]); + // m128 st4 = load_m128_from_u64a((u64a *)reach[1]); + // m128 st8 = load_m128_from_u64a((u64a *)reach[2]); + // m128 st12 = load_m128_from_u64a((u64a *)reach[3]); + // m128 st1 = load_m128_from_u64a((u64a *)reach[4]); + // m128 st5 = load_m128_from_u64a((u64a *)reach[5]); + // m128 st9 = load_m128_from_u64a((u64a *)reach[6]); + // m128 st13 = load_m128_from_u64a((u64a *)reach[7]); + // m128 st2 = load_m128_from_u64a((u64a *)reach[8]); + // m128 st6 = load_m128_from_u64a((u64a *)reach[9]); + // m128 st10 = load_m128_from_u64a((u64a *)reach[10]); + // m128 st14 = load_m128_from_u64a((u64a *)reach[11]); + // m128 st3 = load_m128_from_u64a((u64a *)reach[12]); + // m128 st7 = load_m128_from_u64a((u64a *)reach[13]); + // m128 st11 = load_m128_from_u64a((u64a *)reach[14]); + // m128 st15 = load_m128_from_u64a((u64a *)reach[15]); + + st1 = lshiftbyte_m128(st1, 1); + st2 = lshiftbyte_m128(st2, 2); + st3 = lshiftbyte_m128(st3, 3); + st4 = lshiftbyte_m128(st4, 4); + st5 = lshiftbyte_m128(st5, 5); + st6 = lshiftbyte_m128(st6, 6); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st10 = lshiftbyte_m128(st10, 2); + st11 = lshiftbyte_m128(st11, 3); + st12 = lshiftbyte_m128(st12, 4); + st13 = lshiftbyte_m128(st13, 5); + st14 = lshiftbyte_m128(st14, 6); + st15 = lshiftbyte_m128(st15, 7); st0 = or128(st0, st1); st2 = or128(st2, st3); diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 7f8539b0..858866d7 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -181,6 +181,10 @@ static really_inline m128 set1_2x64(u64a c) { return (m128) vdupq_n_u64(c); } +static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { + return vsetq_lane_u32((uint32x4_t)in, val, imm); +} + static really_inline u32 movd(const m128 in) { return vgetq_lane_u32((uint32x4_t) in, 0); } @@ -195,6 +199,12 @@ m128 load_m128_from_u64a(const u64a *p) { return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); } +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); +} + static really_inline u32 extract32from128(const m128 in, unsigned imm) { #if defined(HAVE__BUILTIN_CONSTANT_P) if (__builtin_constant_p(imm)) { diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 24331b10..4ac92ab3 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -384,6 +384,14 @@ m256 pshufb_m256(m256 a, m256 b) { return rv; } +static really_inline +m256 widen128(m128 x) { + m256 rv; + rv.lo = widenlo128(x); + rv.hi = widenhi128(x); + return rv; +} + #endif // HAVE_SIMD_256_BITS /**** diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 49797aba..9c2984c0 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -122,6 +122,17 @@ m128 sub_2x64(m128 a, m128 b) { return (m128) _mm_sub_epi64(a, b); } +static really_really_inline +m128 lshift32_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi32(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi32(a, x); +} + static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { #if defined(HAVE__BUILTIN_CONSTANT_P) @@ -156,6 +167,10 @@ static really_inline m128 set1_2x64(u64a c) { return _mm_set1_epi64x(c); } +static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { + return _mm_insert_epi32(in, val, imm); +} + static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } @@ -451,6 +466,18 @@ m128 set2x64(u64a hi, u64a lo) { return _mm_set_epi64x(hi, lo); } +#include "../print_simd.h" + +static really_inline +m128 widenlo128(m128 x) { + return _mm_unpacklo_epi32(x, zeroes128()); +} + +static really_inline +m128 widenhi128(m128 x) { + return _mm_unpackhi_epi32(x, zeroes128()); +} + /**** **** 256-bit Primitives ****/ @@ -677,6 +704,12 @@ m256 combine2x128(m128 hi, m128 lo) { return insert128to256(cast128to256(lo), hi, 1); #endif } + +static really_inline +m256 widen128(m128 x) { + return (m256) _mm256_cvtepu32_epi64(x); +} + #endif //AVX2 /****