/*
 * Copyright (c) 2015-2017, Intel Corporation
 * Copyright (c) 2020-2021, VectorCamp PC
 * Copyright (c) 2021, Arm Limited
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Mask-based state compression, used by the NFA.
 */

#include "config.h"
#include "ue2common.h"
#include "arch.h"
#include "bitutils.h"
#include "unaligned.h"
#include "pack_bits.h"
#include "partial_store.h"
#include "popcount.h"
#include "state_compress.h"

#include <string.h> // memcpy, memset

/*
 * 32-bit store/load.
 */

void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes) {
    assert(popcount32(*m) <= bytes * 8);

    u32 v = compress32(*x, *m);
    partial_store_u32(ptr, v, bytes);
}

void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes) {
    assert(popcount32(*m) <= bytes * 8);

    u32 v = partial_load_u32(ptr, bytes);
    *x = expand32(v, *m);
}

/*
 * 64-bit store/load.
 */

void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) {
    assert(popcount64(*m) <= bytes * 8);

    u64a v = compress64(*x, *m);
    partial_store_u64a(ptr, v, bytes);
}

void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) {
    assert(popcount64(*m) <= bytes * 8);

#ifdef HAVE_SVE2_BITPERM
    svbool_t pg = svwhilelt_b8(0U, bytes);
    svuint64_t expanded = svbdep(svreinterpret_u64(svld1_u8(pg, ptr)), *m);
    svst1(svptrue_pat_b64(SV_VL1), (uint64_t *)x, expanded);
#else
    u64a v = partial_load_u64a(ptr, bytes);
    *x = expand64(v, *m);
#endif
}
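/*
 * For reference only (not part of the original file): a minimal scalar
 * sketch of the bit compaction that compress64()/expand64() from bitutils.h
 * perform. compress gathers the bits of x selected by m into the low-order
 * positions (PEXT-style); expand scatters them back out to their mask
 * positions (PDEP-style). The real implementations use BMI2 or SVE2 where
 * available; these loops only document the semantics relied on below.
 */
static UNUSED
u64a compress64_ref(u64a x, u64a m) {
    u64a out = 0;
    for (u64a bb = 1; m; bb <<= 1) {
        if (x & m & -m) {
            out |= bb; // selected bit moves to the next low-order slot
        }
        m &= m - 1; // clear the lowest set bit of the mask
    }
    return out;
}

static UNUSED
u64a expand64_ref(u64a x, u64a m) {
    u64a out = 0;
    for (u64a bb = 1; m; bb <<= 1) {
        if (x & bb) {
            out |= m & -m; // low-order bit moves back to its mask position
        }
        m &= m - 1;
    }
    return out;
}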
/*
 * 128-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[4];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]) };

    // Compress each 32-bit chunk individually.
    u32 v[4] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                 compress32(x[2], m[2]), compress32(x[3], m[3]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 4);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a ALIGN_ATTR(16) x[2];
    u64a ALIGN_ATTR(16) m[2];
    store128(m, mvec);

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) };

    // Compress the whole vector at once.
    xvec = compress128(xvec, mvec);
    store128(x, xvec);

    // Write packed data out.
    pack_bits_64(ptr, x, bits, 2);
}
#endif

void storecompressed128(void *ptr, const m128 *x, const m128 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed128_64bit(ptr, *x, *m);
#else
    storecompressed128_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[4];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]) };
    u32 v[4];

    unpack_bits_32(v, (const u8 *)ptr, bits, 4);

    u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                 expand32(v[2], m[2]), expand32(v[3], m[3]) };

    return set4x32(x[3], x[2], x[1], x[0]);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a ALIGN_ATTR(16) m[2];
    store128(m, mvec);

    // Count the number of bits of compressed state we're reading in per
    // chunk.
    u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) };

    u64a ALIGN_ATTR(16) v[2];
    unpack_bits_64(v, (const u8 *)ptr, bits, 2);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) xvec[2];
    bdep64x2(xvec, v, &mvec);
    return load128(xvec);
#else
    return expand128(load128(v), mvec);
#endif
}
#endif

void loadcompressed128(m128 *x, const void *ptr, const m128 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed128_64bit(ptr, *m);
#else
    *x = loadcompressed128_32bit(ptr, *m);
#endif
}
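/*
 * Illustrative helper, an assumption rather than part of the original API:
 * under the packing scheme above, the buffer a caller must provide for a
 * compressed 128-bit state is determined purely by the mask population,
 * rounded up to whole bytes.
 */
static UNUSED
u32 compressed_size_m128(const m128 *m) {
    u64a ALIGN_ATTR(16) chunks[2];
    store128(chunks, *m);

    u32 total_bits = popcount64(chunks[0]) + popcount64(chunks[1]);
    return (total_bits + 7) / 8; // round up to whole bytes
}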
/*
 * 256-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[8];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[8];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]),
                    popcount32(m[4]), popcount32(m[5]),
                    popcount32(m[6]), popcount32(m[7]) };

    // Compress each 32-bit chunk individually.
    u32 v[8] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                 compress32(x[2], m[2]), compress32(x[3], m[3]),
                 compress32(x[4], m[4]), compress32(x[5], m[5]),
                 compress32(x[6], m[6]), compress32(x[7], m[7]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 8);
}
#endif

#if defined(ARCH_64_BIT)
static really_really_inline
void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a ALIGN_ATTR(32) x[4];
    u64a ALIGN_ATTR(32) m[4];
    store256(x, xvec);
    store256(m, mvec);

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]) };

    // Compress each 64-bit chunk individually.
    u64a v[4] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 4);
}
#endif

void storecompressed256(void *ptr, const m256 *x, const m256 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed256_64bit(ptr, *x, *m);
#else
    storecompressed256_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[8];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]),
                    popcount32(m[4]), popcount32(m[5]),
                    popcount32(m[6]), popcount32(m[7]) };
    u32 v[8];

    unpack_bits_32(v, (const u8 *)ptr, bits, 8);

    u32 x[8] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                 expand32(v[2], m[2]), expand32(v[3], m[3]),
                 expand32(v[4], m[4]), expand32(v[5], m[5]),
                 expand32(v[6], m[6]), expand32(v[7], m[7]) };

#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
                  .hi = set4x32(x[7], x[6], x[5], x[4]) };
#else
    m256 xvec = set8x32(x[7], x[6], x[5], x[4],
                        x[3], x[2], x[1], x[0]);
#endif
    return xvec;
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a m[4];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]) };
    u64a v[4];

    unpack_bits_64(v, (const u8 *)ptr, bits, 4);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) x[4];
    bdep64x2(x, v, &mvec.lo);
    bdep64x2(&x[2], &v[2], &mvec.hi);
#else
    u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]) };
#endif

#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = set2x64(x[1], x[0]),
                  .hi = set2x64(x[3], x[2]) };
#else
    m256 xvec = set4x64(x[3], x[2], x[1], x[0]);
#endif
    return xvec;
}
#endif

void loadcompressed256(m256 *x, const void *ptr, const m256 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed256_64bit(ptr, *m);
#else
    *x = loadcompressed256_32bit(ptr, *m);
#endif
}
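/*
 * For reference only: a simplified sketch of the contract assumed here for
 * pack_bits_64()/unpack_bits_64() from pack_bits.h, namely that the low
 * bits[i] bits of each word are concatenated into one contiguous stream.
 * This bit-at-a-time loop is purely illustrative (the exact bit and byte
 * order is an assumption); the real routines work on whole words.
 */
static UNUSED
void pack_bits_64_ref(u8 *out, const u64a *v, const u32 *bits,
                      u32 elements) {
    u32 total = 0;
    for (u32 i = 0; i < elements; i++) {
        total += bits[i];
    }
    memset(out, 0, (total + 7) / 8); // zero every byte we will touch

    u32 pos = 0; // write cursor, in bits
    for (u32 i = 0; i < elements; i++) {
        for (u32 b = 0; b < bits[i]; b++, pos++) {
            if ((v[i] >> b) & 1) {
                out[pos / 8] |= (u8)(1U << (pos % 8));
            }
        }
    }
}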
/*
 * 384-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed384_32bit(void *ptr, m384 xvec, m384 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[12];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[12];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]) };

    // Compress each 32-bit chunk individually.
    u32 v[12] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                  compress32(x[2], m[2]), compress32(x[3], m[3]),
                  compress32(x[4], m[4]), compress32(x[5], m[5]),
                  compress32(x[6], m[6]), compress32(x[7], m[7]),
                  compress32(x[8], m[8]), compress32(x[9], m[9]),
                  compress32(x[10], m[10]), compress32(x[11], m[11]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 12);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
void storecompressed384_64bit(void *ptr, m384 xvec, m384 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a x[6];
    memcpy(x, &xvec, sizeof(xvec));
    u64a m[6];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]) };

    // Compress each 64-bit chunk individually.
    u64a v[6] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]),
                  compress64(x[4], m[4]), compress64(x[5], m[5]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 6);
}
#endif

void storecompressed384(void *ptr, const m384 *x, const m384 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed384_64bit(ptr, *x, *m);
#else
    storecompressed384_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[12];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]) };
    u32 v[12];

    unpack_bits_32(v, (const u8 *)ptr, bits, 12);

    u32 x[12] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                  expand32(v[2], m[2]), expand32(v[3], m[3]),
                  expand32(v[4], m[4]), expand32(v[5], m[5]),
                  expand32(v[6], m[6]), expand32(v[7], m[7]),
                  expand32(v[8], m[8]), expand32(v[9], m[9]),
                  expand32(v[10], m[10]), expand32(v[11], m[11]) };

    m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
                  .mid = set4x32(x[7], x[6], x[5], x[4]),
                  .hi = set4x32(x[11], x[10], x[9], x[8]) };
    return xvec;
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a m[6];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]) };
    u64a v[6];

    unpack_bits_64(v, (const u8 *)ptr, bits, 6);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) x[6];
    bdep64x2(x, v, &mvec.lo);
    bdep64x2(&x[2], &v[2], &mvec.mid);
    bdep64x2(&x[4], &v[4], &mvec.hi);
#else
    u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]),
                  expand64(v[4], m[4]), expand64(v[5], m[5]) };
#endif

    m384 xvec = { .lo = set2x64(x[1], x[0]),
                  .mid = set2x64(x[3], x[2]),
                  .hi = set2x64(x[5], x[4]) };
    return xvec;
}
#endif

void loadcompressed384(m384 *x, const void *ptr, const m384 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed384_64bit(ptr, *m);
#else
    *x = loadcompressed384_32bit(ptr, *m);
#endif
}
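/*
 * Sketch of the contract assumed for bdep64x2() (bitutils.h) on SVE2
 * builds: expand two packed 64-bit values against the two 64-bit halves of
 * an m128 mask in a single call, i.e. the vectorized equivalent of two
 * expand64() calls. This scalar restatement is for documentation only.
 */
static UNUSED
void bdep64x2_ref(u64a *d, const u64a *v, const m128 *mvec) {
    u64a ALIGN_ATTR(16) m[2];
    store128(m, *mvec);

    d[0] = expand64(v[0], m[0]);
    d[1] = expand64(v[1], m[1]);
}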
/*
 * 512-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed512_32bit(void *ptr, m512 xvec, m512 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[16];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[16];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]),
                     popcount32(m[12]), popcount32(m[13]),
                     popcount32(m[14]), popcount32(m[15]) };

    // Compress each 32-bit chunk individually.
    u32 v[16] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                  compress32(x[2], m[2]), compress32(x[3], m[3]),
                  compress32(x[4], m[4]), compress32(x[5], m[5]),
                  compress32(x[6], m[6]), compress32(x[7], m[7]),
                  compress32(x[8], m[8]), compress32(x[9], m[9]),
                  compress32(x[10], m[10]), compress32(x[11], m[11]),
                  compress32(x[12], m[12]), compress32(x[13], m[13]),
                  compress32(x[14], m[14]), compress32(x[15], m[15]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 16);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
void storecompressed512_64bit(void *ptr, m512 xvec, m512 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a m[8];
    memcpy(m, &mvec, sizeof(mvec));
    u64a x[8];
    memcpy(x, &xvec, sizeof(xvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]),
                    popcount64(m[6]), popcount64(m[7]) };

    // Compress each 64-bit chunk individually.
    u64a v[8] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]),
                  compress64(x[4], m[4]), compress64(x[5], m[5]),
                  compress64(x[6], m[6]), compress64(x[7], m[7]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 8);
}
#endif

void storecompressed512(void *ptr, const m512 *x, const m512 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed512_64bit(ptr, *x, *m);
#else
    storecompressed512_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[16];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]),
                     popcount32(m[12]), popcount32(m[13]),
                     popcount32(m[14]), popcount32(m[15]) };
    u32 v[16];

    unpack_bits_32(v, (const u8 *)ptr, bits, 16);

    u32 x[16] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                  expand32(v[2], m[2]), expand32(v[3], m[3]),
                  expand32(v[4], m[4]), expand32(v[5], m[5]),
                  expand32(v[6], m[6]), expand32(v[7], m[7]),
                  expand32(v[8], m[8]), expand32(v[9], m[9]),
                  expand32(v[10], m[10]), expand32(v[11], m[11]),
                  expand32(v[12], m[12]), expand32(v[13], m[13]),
                  expand32(v[14], m[14]), expand32(v[15], m[15]) };

    m512 xvec;
#if defined(HAVE_AVX512)
    xvec = set32x16(x[15], x[14], x[13], x[12],
                    x[11], x[10], x[9], x[8],
                    x[7], x[6], x[5], x[4],
                    x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    xvec.lo = set8x32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]);
    xvec.hi = set8x32(x[15], x[14], x[13], x[12], x[11], x[10], x[9], x[8]);
#else
    xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]);
    xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]);
    xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]);
    xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]);
#endif
    return xvec;
}
#endif
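/*
 * Note on lane order, restated as a check: the set2x64/set4x64/set8x64
 * constructors used in these load routines take lanes from highest to
 * lowest, so chunk x[0] always lands back in the lowest bits of the vector.
 * A hypothetical sanity check under that assumption:
 */
static UNUSED
void check_lane_order(void) {
    u64a ALIGN_ATTR(16) out[2];
    m128 vec = set2x64(0x2222222222222222ULL /* high lane */,
                       0x1111111111111111ULL /* low lane */);
    store128(out, vec);

    assert(out[0] == 0x1111111111111111ULL);
    assert(out[1] == 0x2222222222222222ULL);
}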
#if defined(ARCH_64_BIT)
static really_inline
m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a m[8];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]),
                    popcount64(m[6]), popcount64(m[7]) };
    u64a v[8];

    unpack_bits_64(v, (const u8 *)ptr, bits, 8);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) x[8];
    bdep64x2(x, v, &mvec.lo.lo);
    bdep64x2(&x[2], &v[2], &mvec.lo.hi);
    bdep64x2(&x[4], &v[4], &mvec.hi.lo);
    bdep64x2(&x[6], &v[6], &mvec.hi.hi);
#else
    u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]),
                  expand64(v[4], m[4]), expand64(v[5], m[5]),
                  expand64(v[6], m[6]), expand64(v[7], m[7]) };
#endif

#if defined(HAVE_AVX512)
    m512 xvec = set8x64(x[7], x[6], x[5], x[4],
                        x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]),
                  .hi = set4x64(x[7], x[6], x[5], x[4]) };
#else
    m512 xvec = { .lo = { set2x64(x[1], x[0]),
                          set2x64(x[3], x[2]) },
                  .hi = { set2x64(x[5], x[4]),
                          set2x64(x[7], x[6]) } };
#endif
    return xvec;
}
#endif

void loadcompressed512(m512 *x, const void *ptr, const m512 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed512_64bit(ptr, *m);
#else
    *x = loadcompressed512_32bit(ptr, *m);
#endif
}
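/*
 * Illustrative round trip, an informal sketch rather than part of the
 * library: compressing a 128-bit state under a mask and expanding it back
 * recovers exactly the bits covered by the mask. The values are arbitrary;
 * with 12 mask bits set, the packed form occupies just two bytes of buf.
 */
static UNUSED
void example_roundtrip128(void) {
    m128 x = set4x32(0x0U, 0x10U, 0x0U, 0x5U);  // live state bits
    m128 m = set4x32(0x0U, 0xff0U, 0x0U, 0xfU); // mask covering them
    u8 buf[sizeof(m128)];

    storecompressed128(buf, &x, &m, sizeof(buf));

    m128 out;
    loadcompressed128(&out, buf, &m, sizeof(buf));

    // Everything under the mask survives the round trip.
    assert(!diff128(out, and128(x, m)));
}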