/*
 * Copyright (c) 2015-2017, Intel Corporation
 * Copyright (c) 2020-2021, VectorCamp PC
 * Copyright (c) 2021, Arm Limited
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Mask-based state compression, used by the NFA.
 */

#include "config.h"
#include "ue2common.h"
#include "arch.h"
#include "bitutils.h"
#include "unaligned.h"
#include "pack_bits.h"
#include "partial_store.h"
#include "popcount.h"
#include "state_compress.h"

#include <string.h> // memcpy, memset

/*
 * 32-bit store/load.
 */

void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes) {
    assert(popcount32(*m) <= bytes * 8);

    u32 v = compress32(*x, *m);
    partial_store_u32(ptr, v, bytes);
}

void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes) {
    assert(popcount32(*m) <= bytes * 8);

    u32 v = partial_load_u32(ptr, bytes);
    *x = expand32(v, *m);
}

/*
 * 64-bit store/load.
 */

void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) {
    assert(popcount64(*m) <= bytes * 8);

    u64a v = compress64(*x, *m);
    partial_store_u64a(ptr, v, bytes);
}

void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) {
    assert(popcount64(*m) <= bytes * 8);

#ifdef HAVE_SVE2_BITPERM
    svbool_t pg = svwhilelt_b8(0U, bytes);
    svuint64_t expanded = svbdep(svreinterpret_u64(svld1_u8(pg, ptr)), *m);
    svst1(svptrue_pat_b64(SV_VL1), (uint64_t *)x, expanded);
#else
    u64a v = partial_load_u64a(ptr, bytes);
    *x = expand64(v, *m);
#endif
}
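/*
 * For reference only (not part of the original file): a minimal scalar
 * sketch of the bit compaction that compress64()/expand64() from bitutils.h
 * perform. compress gathers the bits of x selected by m into the low-order
 * positions (PEXT-style); expand scatters them back out to their mask
 * positions (PDEP-style). The real implementations use BMI2 or SVE2 where
 * available; these loops only document the semantics relied on below.
 */
static UNUSED
u64a compress64_ref(u64a x, u64a m) {
    u64a out = 0;
    for (u64a bb = 1; m; bb <<= 1) {
        if (x & m & -m) {
            out |= bb; // selected bit moves to the next low-order slot
        }
        m &= m - 1; // clear the lowest set bit of the mask
    }
    return out;
}

static UNUSED
u64a expand64_ref(u64a x, u64a m) {
    u64a out = 0;
    for (u64a bb = 1; m; bb <<= 1) {
        if (x & bb) {
            out |= m & -m; // low-order bit moves back to its mask position
        }
        m &= m - 1;
    }
    return out;
}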
/*
 * 128-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[4];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]) };

    // Compress each 32-bit chunk individually.
    u32 v[4] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                 compress32(x[2], m[2]), compress32(x[3], m[3]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 4);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a ALIGN_ATTR(16) x[2];
    u64a ALIGN_ATTR(16) m[2];
    store128(m, mvec);

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) };

    // Compress the whole vector at once.
    xvec = compress128(xvec, mvec);
    store128(x, xvec);

    // Write packed data out.
    pack_bits_64(ptr, x, bits, 2);
}
#endif

void storecompressed128(void *ptr, const m128 *x, const m128 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed128_64bit(ptr, *x, *m);
#else
    storecompressed128_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[4];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]) };
    u32 v[4];

    unpack_bits_32(v, (const u8 *)ptr, bits, 4);

    u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                 expand32(v[2], m[2]), expand32(v[3], m[3]) };

    return set4x32(x[3], x[2], x[1], x[0]);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a ALIGN_ATTR(16) m[2];
    store128(m, mvec);

    // Count the number of bits of compressed state we're reading in per
    // chunk.
    u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) };

    u64a ALIGN_ATTR(16) v[2];
    unpack_bits_64(v, (const u8 *)ptr, bits, 2);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) xvec[2];
    bdep64x2(xvec, v, &mvec);
    return load128(xvec);
#else
    return expand128(load128(v), mvec);
#endif
}
#endif

void loadcompressed128(m128 *x, const void *ptr, const m128 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed128_64bit(ptr, *m);
#else
    *x = loadcompressed128_32bit(ptr, *m);
#endif
}
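/*
 * Illustrative helper, an assumption rather than part of the original API:
 * under the packing scheme above, the buffer a caller must provide for a
 * compressed 128-bit state is determined purely by the mask population,
 * rounded up to whole bytes.
 */
static UNUSED
u32 compressed_size_m128(const m128 *m) {
    u64a ALIGN_ATTR(16) chunks[2];
    store128(chunks, *m);

    u32 total_bits = popcount64(chunks[0]) + popcount64(chunks[1]);
    return (total_bits + 7) / 8; // round up to whole bytes
}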
/*
 * 256-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[8];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[8];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]),
                    popcount32(m[4]), popcount32(m[5]),
                    popcount32(m[6]), popcount32(m[7]) };

    // Compress each 32-bit chunk individually.
    u32 v[8] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                 compress32(x[2], m[2]), compress32(x[3], m[3]),
                 compress32(x[4], m[4]), compress32(x[5], m[5]),
                 compress32(x[6], m[6]), compress32(x[7], m[7]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 8);
}
#endif

#if defined(ARCH_64_BIT)
static really_really_inline
void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a ALIGN_ATTR(32) x[4];
    u64a ALIGN_ATTR(32) m[4];
    store256(x, xvec);
    store256(m, mvec);

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]) };

    // Compress each 64-bit chunk individually.
    u64a v[4] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 4);
}
#endif

void storecompressed256(void *ptr, const m256 *x, const m256 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed256_64bit(ptr, *x, *m);
#else
    storecompressed256_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[8];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]),
                    popcount32(m[4]), popcount32(m[5]),
                    popcount32(m[6]), popcount32(m[7]) };
    u32 v[8];

    unpack_bits_32(v, (const u8 *)ptr, bits, 8);

    u32 x[8] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                 expand32(v[2], m[2]), expand32(v[3], m[3]),
                 expand32(v[4], m[4]), expand32(v[5], m[5]),
                 expand32(v[6], m[6]), expand32(v[7], m[7]) };

#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
                  .hi = set4x32(x[7], x[6], x[5], x[4]) };
#else
    m256 xvec = set8x32(x[7], x[6], x[5], x[4],
                        x[3], x[2], x[1], x[0]);
#endif
    return xvec;
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a m[4];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]) };
    u64a v[4];

    unpack_bits_64(v, (const u8 *)ptr, bits, 4);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) x[4];
    bdep64x2(x, v, &mvec.lo);
    bdep64x2(&x[2], &v[2], &mvec.hi);
#else
    u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]) };
#endif

#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = set2x64(x[1], x[0]),
                  .hi = set2x64(x[3], x[2]) };
#else
    m256 xvec = set4x64(x[3], x[2], x[1], x[0]);
#endif
    return xvec;
}
#endif

void loadcompressed256(m256 *x, const void *ptr, const m256 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed256_64bit(ptr, *m);
#else
    *x = loadcompressed256_32bit(ptr, *m);
#endif
}
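/*
 * For reference only: a simplified sketch of the contract assumed here for
 * pack_bits_64()/unpack_bits_64() from pack_bits.h, namely that the low
 * bits[i] bits of each word are concatenated into one contiguous stream.
 * This bit-at-a-time loop is purely illustrative (the exact bit and byte
 * order is an assumption); the real routines work on whole words.
 */
static UNUSED
void pack_bits_64_ref(u8 *out, const u64a *v, const u32 *bits,
                      u32 elements) {
    u32 total = 0;
    for (u32 i = 0; i < elements; i++) {
        total += bits[i];
    }
    memset(out, 0, (total + 7) / 8); // zero every byte we will touch

    u32 pos = 0; // write cursor, in bits
    for (u32 i = 0; i < elements; i++) {
        for (u32 b = 0; b < bits[i]; b++, pos++) {
            if ((v[i] >> b) & 1) {
                out[pos / 8] |= (u8)(1U << (pos % 8));
            }
        }
    }
}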
/*
 * 384-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed384_32bit(void *ptr, m384 xvec, m384 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[12];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[12];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]) };

    // Compress each 32-bit chunk individually.
    u32 v[12] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                  compress32(x[2], m[2]), compress32(x[3], m[3]),
                  compress32(x[4], m[4]), compress32(x[5], m[5]),
                  compress32(x[6], m[6]), compress32(x[7], m[7]),
                  compress32(x[8], m[8]), compress32(x[9], m[9]),
                  compress32(x[10], m[10]), compress32(x[11], m[11]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 12);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
void storecompressed384_64bit(void *ptr, m384 xvec, m384 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a x[6];
    memcpy(x, &xvec, sizeof(xvec));
    u64a m[6];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]) };

    // Compress each 64-bit chunk individually.
    u64a v[6] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]),
                  compress64(x[4], m[4]), compress64(x[5], m[5]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 6);
}
#endif

void storecompressed384(void *ptr, const m384 *x, const m384 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed384_64bit(ptr, *x, *m);
#else
    storecompressed384_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[12];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]) };
    u32 v[12];

    unpack_bits_32(v, (const u8 *)ptr, bits, 12);

    u32 x[12] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                  expand32(v[2], m[2]), expand32(v[3], m[3]),
                  expand32(v[4], m[4]), expand32(v[5], m[5]),
                  expand32(v[6], m[6]), expand32(v[7], m[7]),
                  expand32(v[8], m[8]), expand32(v[9], m[9]),
                  expand32(v[10], m[10]), expand32(v[11], m[11]) };

    m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
                  .mid = set4x32(x[7], x[6], x[5], x[4]),
                  .hi = set4x32(x[11], x[10], x[9], x[8]) };
    return xvec;
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a m[6];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]) };
    u64a v[6];

    unpack_bits_64(v, (const u8 *)ptr, bits, 6);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) x[6];
    bdep64x2(x, v, &mvec.lo);
    bdep64x2(&x[2], &v[2], &mvec.mid);
    bdep64x2(&x[4], &v[4], &mvec.hi);
#else
    u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]),
                  expand64(v[4], m[4]), expand64(v[5], m[5]) };
#endif

    m384 xvec = { .lo = set2x64(x[1], x[0]),
                  .mid = set2x64(x[3], x[2]),
                  .hi = set2x64(x[5], x[4]) };
    return xvec;
}
#endif

void loadcompressed384(m384 *x, const void *ptr, const m384 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed384_64bit(ptr, *m);
#else
    *x = loadcompressed384_32bit(ptr, *m);
#endif
}
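/*
 * Sketch of the contract assumed for bdep64x2() (bitutils.h) on SVE2
 * builds: expand two packed 64-bit values against the two 64-bit halves of
 * an m128 mask in a single call, i.e. the vectorized equivalent of two
 * expand64() calls. This scalar restatement is for documentation only.
 */
static UNUSED
void bdep64x2_ref(u64a *d, const u64a *v, const m128 *mvec) {
    u64a ALIGN_ATTR(16) m[2];
    store128(m, *mvec);

    d[0] = expand64(v[0], m[0]);
    d[1] = expand64(v[1], m[1]);
}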
/*
 * 512-bit store/load.
 */

#if defined(ARCH_32_BIT)
static really_inline
void storecompressed512_32bit(void *ptr, m512 xvec, m512 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[16];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[16];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]),
                     popcount32(m[12]), popcount32(m[13]),
                     popcount32(m[14]), popcount32(m[15]) };

    // Compress each 32-bit chunk individually.
    u32 v[16] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                  compress32(x[2], m[2]), compress32(x[3], m[3]),
                  compress32(x[4], m[4]), compress32(x[5], m[5]),
                  compress32(x[6], m[6]), compress32(x[7], m[7]),
                  compress32(x[8], m[8]), compress32(x[9], m[9]),
                  compress32(x[10], m[10]), compress32(x[11], m[11]),
                  compress32(x[12], m[12]), compress32(x[13], m[13]),
                  compress32(x[14], m[14]), compress32(x[15], m[15]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 16);
}
#endif

#if defined(ARCH_64_BIT)
static really_inline
void storecompressed512_64bit(void *ptr, m512 xvec, m512 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a m[8];
    memcpy(m, &mvec, sizeof(mvec));
    u64a x[8];
    memcpy(x, &xvec, sizeof(xvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]),
                    popcount64(m[6]), popcount64(m[7]) };

    // Compress each 64-bit chunk individually.
    u64a v[8] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]),
                  compress64(x[4], m[4]), compress64(x[5], m[5]),
                  compress64(x[6], m[6]), compress64(x[7], m[7]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 8);
}
#endif

void storecompressed512(void *ptr, const m512 *x, const m512 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed512_64bit(ptr, *x, *m);
#else
    storecompressed512_32bit(ptr, *x, *m);
#endif
}

#if defined(ARCH_32_BIT)
static really_inline
m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[16];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]),
                     popcount32(m[12]), popcount32(m[13]),
                     popcount32(m[14]), popcount32(m[15]) };
    u32 v[16];

    unpack_bits_32(v, (const u8 *)ptr, bits, 16);

    u32 x[16] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                  expand32(v[2], m[2]), expand32(v[3], m[3]),
                  expand32(v[4], m[4]), expand32(v[5], m[5]),
                  expand32(v[6], m[6]), expand32(v[7], m[7]),
                  expand32(v[8], m[8]), expand32(v[9], m[9]),
                  expand32(v[10], m[10]), expand32(v[11], m[11]),
                  expand32(v[12], m[12]), expand32(v[13], m[13]),
                  expand32(v[14], m[14]), expand32(v[15], m[15]) };

    m512 xvec;
#if defined(HAVE_AVX512)
    xvec = set32x16(x[15], x[14], x[13], x[12],
                    x[11], x[10], x[9], x[8],
                    x[7], x[6], x[5], x[4],
                    x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    xvec.lo = set8x32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]);
    xvec.hi = set8x32(x[15], x[14], x[13], x[12], x[11], x[10], x[9], x[8]);
#else
    xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]);
    xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]);
    xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]);
    xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]);
#endif
    return xvec;
}
#endif
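/*
 * Note on lane order, restated as a check: the set2x64/set4x64/set8x64
 * constructors used in these load routines take lanes from highest to
 * lowest, so chunk x[0] always lands back in the lowest bits of the vector.
 * A hypothetical sanity check under that assumption:
 */
static UNUSED
void check_lane_order(void) {
    u64a ALIGN_ATTR(16) out[2];
    m128 vec = set2x64(0x2222222222222222ULL /* high lane */,
                       0x1111111111111111ULL /* low lane */);
    store128(out, vec);

    assert(out[0] == 0x1111111111111111ULL);
    assert(out[1] == 0x2222222222222222ULL);
}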
#if defined(ARCH_64_BIT)
static really_inline
m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
    // First, decompose our mask vector into 64-bit chunks.
    u64a m[8];
    memcpy(m, &mvec, sizeof(mvec));

    u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]),
                    popcount64(m[6]), popcount64(m[7]) };
    u64a v[8];

    unpack_bits_64(v, (const u8 *)ptr, bits, 8);

#ifdef HAVE_SVE2_BITPERM
    u64a ALIGN_ATTR(16) x[8];
    bdep64x2(x, v, &mvec.lo.lo);
    bdep64x2(&x[2], &v[2], &mvec.lo.hi);
    bdep64x2(&x[4], &v[4], &mvec.hi.lo);
    bdep64x2(&x[6], &v[6], &mvec.hi.hi);
#else
    u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]),
                  expand64(v[4], m[4]), expand64(v[5], m[5]),
                  expand64(v[6], m[6]), expand64(v[7], m[7]) };
#endif

#if defined(HAVE_AVX512)
    m512 xvec = set8x64(x[7], x[6], x[5], x[4],
                        x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]),
                  .hi = set4x64(x[7], x[6], x[5], x[4]) };
#else
    m512 xvec = { .lo = { set2x64(x[1], x[0]),
                          set2x64(x[3], x[2]) },
                  .hi = { set2x64(x[5], x[4]),
                          set2x64(x[7], x[6]) } };
#endif
    return xvec;
}
#endif

void loadcompressed512(m512 *x, const void *ptr, const m512 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed512_64bit(ptr, *m);
#else
    *x = loadcompressed512_32bit(ptr, *m);
#endif
}
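/*
 * Illustrative round trip, an informal sketch rather than part of the
 * library: compressing a 128-bit state under a mask and expanding it back
 * recovers exactly the bits covered by the mask. The values are arbitrary;
 * with 12 mask bits set, the packed form occupies just two bytes of buf.
 */
static UNUSED
void example_roundtrip128(void) {
    m128 x = set4x32(0x0U, 0x10U, 0x0U, 0x5U);  // live state bits
    m128 m = set4x32(0x0U, 0xff0U, 0x0U, 0xfU); // mask covering them
    u8 buf[sizeof(m128)];

    storecompressed128(buf, &x, &m, sizeof(buf));

    m128 out;
    loadcompressed128(&out, buf, &m, sizeof(buf));

    // Everything under the mask survives the round trip.
    assert(!diff128(out, and128(x, m)));
}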