diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7077dc42..e112ca83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -664,7 +664,6 @@ set (hs_exec_SRCS
     src/util/scatter.h
     src/util/scatter_runtime.h
     src/util/simd_utils.h
-    src/util/simd_utils.c
     src/util/state_compress.h
     src/util/state_compress.c
     src/util/unaligned.h
diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 917a6ad4..a2f79774 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -279,7 +279,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
     return a;
 }
 
-
 #define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break;
 
 static really_really_inline
diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h
index e74f25d1..24c1abe0 100644
--- a/src/util/arch/x86/simd_utils.h
+++ b/src/util/arch/x86/simd_utils.h
@@ -41,6 +41,23 @@
 
 #include <string.h> // for memcpy
 
+#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
+
+/** \brief LUT for the mask1bit functions. */
+ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
+    ZEROES_32, ZEROES_32,
+    ZEROES_31, 0x01, ZEROES_32,
+    ZEROES_31, 0x02, ZEROES_32,
+    ZEROES_31, 0x04, ZEROES_32,
+    ZEROES_31, 0x08, ZEROES_32,
+    ZEROES_31, 0x10, ZEROES_32,
+    ZEROES_31, 0x20, ZEROES_32,
+    ZEROES_31, 0x40, ZEROES_32,
+    ZEROES_31, 0x80, ZEROES_32,
+    ZEROES_32, ZEROES_32,
+};
 
 static really_inline m128 ones128(void) {
 #if defined(__GNUC__) || defined(__INTEL_COMPILER)
     /* gcc gets this right */
@@ -236,14 +253,14 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
     memcpy(&a, ptr, n);
     return a;
 }
-
+/*
 #ifdef __cplusplus
 extern "C" {
 #endif
 extern const u8 simd_onebit_masks[];
 #ifdef __cplusplus
 }
-#endif
+#endif*/
 
 static really_inline
 m128 mask1bit128(unsigned int n) {
@@ -277,19 +294,68 @@ char testbit128(m128 val, unsigned int n) {
 }
 
 // offset must be an immediate
-#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
+#define palignr_imm(r, l, offset) _mm_alignr_epi8(r, l, offset)
 
 static really_inline
 m128 pshufb_m128(m128 a, m128 b) {
     return _mm_shuffle_epi8(a, b);
 }
 
+#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return palignr_imm((m128)(a), (m128)(b), (offset)); break;
+
+static really_really_inline
+m128 palignr_sw(m128 r, m128 l, int offset) {
+    switch (offset) {
+    case 0: return l; break;
+    CASE_ALIGN_VECTORS(r, l, 1);
+    CASE_ALIGN_VECTORS(r, l, 2);
+    CASE_ALIGN_VECTORS(r, l, 3);
+    CASE_ALIGN_VECTORS(r, l, 4);
+    CASE_ALIGN_VECTORS(r, l, 5);
+    CASE_ALIGN_VECTORS(r, l, 6);
+    CASE_ALIGN_VECTORS(r, l, 7);
+    CASE_ALIGN_VECTORS(r, l, 8);
+    CASE_ALIGN_VECTORS(r, l, 9);
+    CASE_ALIGN_VECTORS(r, l, 10);
+    CASE_ALIGN_VECTORS(r, l, 11);
+    CASE_ALIGN_VECTORS(r, l, 12);
+    CASE_ALIGN_VECTORS(r, l, 13);
+    CASE_ALIGN_VECTORS(r, l, 14);
+    CASE_ALIGN_VECTORS(r, l, 15);
+    case 16: return r; break;
+    default:
+        return zeroes128();
+        break;
+    }
+}
+
+static really_really_inline
+m128 palignr(m128 r, m128 l, int offset) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(offset)) {
+        return palignr_imm(r, l, offset);
+    }
+#endif
+    return palignr_sw(r, l, offset);
+}
+#undef CASE_ALIGN_VECTORS
+
+static really_inline
+m128 variable_byte_shift_m128(m128 in, s32 amount) {
+    assert(amount >= -16 && amount <= 16);
+    if (amount < 0) {
+        return palignr(zeroes128(), in, -amount);
+    } else {
+        return palignr(in, zeroes128(), 16 - amount);
+    }
+}
+/*
 static really_inline
 m128 variable_byte_shift_m128(m128 in, s32 amount) {
     assert(amount >= -16 && amount <= 16);
     m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
     return pshufb_m128(in, shift_mask);
-}
+}*/
 
 static really_inline
 m128 max_u8_m128(m128 a, m128 b) {
diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c
deleted file mode 100644
index 25a81412..00000000
--- a/src/util/simd_utils.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2017, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  * Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of Intel Corporation nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/** \file
- * \brief Lookup tables to support SIMD operations.
- */
-
-#include "simd_utils.h"
-
-ALIGN_CL_DIRECTIVE const char vbs_mask_data[] = {
-    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
-    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
-
-    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-
-    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
-    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
-};
-
-#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
-#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
-#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
-
-/** \brief LUT for the mask1bit functions. */
-ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = {
-    ZEROES_32, ZEROES_32,
-    ZEROES_31, 0x01, ZEROES_32,
-    ZEROES_31, 0x02, ZEROES_32,
-    ZEROES_31, 0x04, ZEROES_32,
-    ZEROES_31, 0x08, ZEROES_32,
-    ZEROES_31, 0x10, ZEROES_32,
-    ZEROES_31, 0x20, ZEROES_32,
-    ZEROES_31, 0x40, ZEROES_32,
-    ZEROES_31, 0x80, ZEROES_32,
-    ZEROES_32, ZEROES_32,
-};
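
--
Note on variable_byte_shift_m128 (appended, not part of the patch): the new
palignr-based implementation is meant to keep the semantics of the deleted
pshufb/vbs_mask_data version: a positive amount shifts bytes towards higher
lanes, a negative amount towards lower lanes, zero-filling the vacated bytes.
The standalone check below is a sketch of that equivalence; ref_shift() and
the harness are mine, only the palignr call shapes (offset -amount and
16 - amount) come from the patch. Since _mm_alignr_epi8 demands an immediate,
the demo uses literal offsets for amount = +3 and -3. Build with e.g.
gcc -mssse3.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <tmmintrin.h> /* SSSE3: _mm_alignr_epi8 */

/* scalar model: shift a 16-byte lane towards higher indices (amount > 0) or
 * lower indices (amount < 0), zero-filling either end */
static void ref_shift(const uint8_t *in, uint8_t *out, int amount) {
    memset(out, 0, 16);
    for (int i = 0; i < 16; i++) {
        int src = i - amount;
        if (src >= 0 && src < 16) {
            out[i] = in[src];
        }
    }
}

int main(void) {
    uint8_t in[16], expect[16], got[16];
    for (int i = 0; i < 16; i++) {
        in[i] = (uint8_t)(i + 1);
    }
    __m128i v = _mm_loadu_si128((const __m128i *)in);

    /* amount = +3: palignr(in, zeroes, 16 - 3) */
    _mm_storeu_si128((__m128i *)got,
                     _mm_alignr_epi8(v, _mm_setzero_si128(), 13));
    ref_shift(in, expect, 3);
    assert(memcmp(got, expect, 16) == 0);

    /* amount = -3: palignr(zeroes, in, 3) */
    _mm_storeu_si128((__m128i *)got,
                     _mm_alignr_epi8(_mm_setzero_si128(), v, 3));
    ref_shift(in, expect, -3);
    assert(memcmp(got, expect, 16) == 0);

    puts("byte-shift semantics match the scalar model");
    return 0;
}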
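Note on the simd_onebit_masks move (appended, not part of the patch):
mask1bit128() performs an unaligned 16-byte load into the table at an offset
derived from the bit index, so each 0x01..0x80 byte needs at least 15 zero
bytes on either side; that is what the ZEROES_31 prefix, the ZEROES_32 suffix
and the all-zero first and last rows guarantee. The sketch below is mine; only
the table initialiser and the index arithmetic are copied from the header. It
checks all 128 masks, with plain memcpy standing in for loadu128 and without
the ALIGN_CL_DIRECTIVE alignment the real table carries.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8

static const uint8_t simd_onebit_masks[] = {
    ZEROES_32, ZEROES_32,
    ZEROES_31, 0x01, ZEROES_32,
    ZEROES_31, 0x02, ZEROES_32,
    ZEROES_31, 0x04, ZEROES_32,
    ZEROES_31, 0x08, ZEROES_32,
    ZEROES_31, 0x10, ZEROES_32,
    ZEROES_31, 0x20, ZEROES_32,
    ZEROES_31, 0x40, ZEROES_32,
    ZEROES_31, 0x80, ZEROES_32,
    ZEROES_32, ZEROES_32,
};

int main(void) {
    for (unsigned n = 0; n < 128; n++) {
        /* same index arithmetic as mask1bit128() in the header */
        uint32_t mask_idx = ((n % 8) * 64) + 95;
        mask_idx -= n / 8;

        uint8_t vec[16];
        memcpy(vec, &simd_onebit_masks[mask_idx], 16); /* the loadu128 */

        /* expect exactly bit (n % 8) of byte (n / 8) to be set */
        for (unsigned i = 0; i < 16; i++) {
            uint8_t want = (i == n / 8) ? (uint8_t)(1u << (n % 8)) : 0;
            assert(vec[i] == want);
        }
    }
    puts("simd_onebit_masks: all 128 single-bit masks check out");
    return 0;
}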
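Note on the palignr() split (appended, not part of the patch):
_mm_alignr_epi8 only accepts a compile-time immediate, so the patch keeps an
immediate form (palignr_imm), a switch-based fallback (palignr_sw), and a
dispatcher that uses __builtin_constant_p, where available, to fold constant
offsets back onto the immediate path. The toy below is my own reduction of
that idiom, with integer shifts standing in for the vector intrinsic, to show
the shape of the dispatch.

#include <stdio.h>

static inline int shift_imm(int x, int k) { /* stands in for palignr_imm */
    return x << k;
}

static inline int shift_sw(int x, int k) { /* stands in for palignr_sw */
    switch (k) {
    case 0: return x;
    case 1: return x << 1;
    case 2: return x << 2;
    default: return 0;
    }
}

static inline int shift(int x, int k) { /* stands in for palignr */
#if defined(__GNUC__)
    if (__builtin_constant_p(k)) {
        return shift_imm(x, k); /* constant k: immediate path */
    }
#endif
    return shift_sw(x, k); /* runtime k: switch path */
}

int main(void) {
    volatile int runtime_k = 2; /* opaque to the compiler: takes the switch */
    printf("%d %d\n", shift(5, 1), shift(5, runtime_k));
    return 0;
}

For __builtin_constant_p to see a constant, the wrapper has to be inlined into
the caller; that is presumably why the patch marks palignr and palignr_sw
really_really_inline (always_inline) rather than plain really_inline.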