/*
 * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Teddy literal matcher: common runtime procedures.
 */

#ifndef TEDDY_RUNTIME_COMMON_H_
#define TEDDY_RUNTIME_COMMON_H_

#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"

extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
#if defined(HAVE_AVX2)
extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
#endif

#if defined(HAVE_AVX512VBMI)
static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
};
#endif

#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
#else
#define TEDDY_CONF_TYPE u32
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
#endif

#define CHECK_HWLM_TERMINATE_MATCHING                                       \
do {                                                                        \
    if (unlikely(control == HWLM_TERMINATE_MATCHING)) {                     \
        return HWLM_TERMINATED;                                             \
    }                                                                       \
} while (0);

#define CHECK_FLOOD                                                         \
do {                                                                        \
    if (unlikely(ptr > tryFloodDetect)) {                                   \
        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
                                     &floodBackoff, &control, iterBytes);   \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while (0);
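/*
 * Illustrative sketch only (not used by the Teddy runtime): how a confirm
 * word of type TEDDY_CONF_TYPE is typically drained bit by bit with
 * TEDDY_FIND_AND_CLEAR_LSB. Each set bit encodes a (byte position, bucket)
 * pair, decoded the same way as in do_confWithBit_teddy below; the function
 * name is hypothetical and the decoded values are simply discarded here.
 */
static really_inline
void exampleDrainConfWord(TEDDY_CONF_TYPE conf, u8 bucket, u8 offset) {
    while (unlikely(conf)) {
        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(&conf); // clears the bit in conf
        u32 byte = bit / bucket + offset; // byte position within the block
        u32 idx = bit % bucket;           // bucket that produced the match
        (void)byte;
        (void)idx;
    }
}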
/*
 * \brief Copy a block of [0,15] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad128.
 */
static really_inline
void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    default:
        /* Perform copy with two overlapping 8-byte chunks. */
        assert(len < 16);
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    }
}

// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
//      m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//      then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//        lo         ptr                      hi           hi
//        |----------|-------|----------------|............|
//    -start         0  -start+offset    MIN(avail,16)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
//     only start zone.
//            history
//        ptr        lo                       hi           hi
//        |----------|-------|----------------|............|
//        0          start   start+offset     end(<=16)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
static really_inline
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[16];
        m128 val128;
    } u;
    u.val128 = zeroes128();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 16) {
            assert(start_offset - start <= 16);
            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
                               + 16 - start_offset + start);
            return loadu128(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
                           + 16 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
                           + 16 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val128;
}
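/*
 * Illustrative sketch only (not used by the Teddy runtime): the end/short
 * zone case of vectoredLoad128 for a buffer holding exactly 10 valid bytes
 * and no history. The returned vector carries the 10 real bytes (the
 * remaining lanes are zero) and *p_mask poisons the 6 trailing lanes with
 * 0xff. The wrapper name and the fixed length are hypothetical.
 */
static really_inline
m128 exampleShortLoad128(m128 *p_mask, const u8 *buf10) {
    // ptr == lo, start_offset == 0, avail == 10 < 16: no history is read,
    // so buf_history/len_history may be empty and nMasks is irrelevant.
    return vectoredLoad128(p_mask, buf10, 0, buf10, buf10 + 10, NULL, 0, 1);
}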
#if defined(HAVE_AVX2)
/*
 * \brief Copy a block of [0,31] bytes efficiently.
 *
 * This function is a workaround intended to stop some compilers from
 * synthesizing a memcpy function call out of the copy of a small number of
 * bytes that we do in vectoredLoad256.
 */
static really_inline
void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
    switch (len) {
    case 0:
        break;
    case 1:
        *dst = *src;
        break;
    case 2:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        break;
    case 3:
        unaligned_store_u16(dst, unaligned_load_u16(src));
        dst[2] = src[2];
        break;
    case 4:
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 5:
    case 6:
    case 7:
        /* Perform copy with two overlapping 4-byte chunks. */
        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
        unaligned_store_u32(dst, unaligned_load_u32(src));
        break;
    case 8:
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
        /* Perform copy with two overlapping 8-byte chunks. */
        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
        unaligned_store_u64a(dst, unaligned_load_u64a(src));
        break;
    case 16:
        storeu128(dst, loadu128(src));
        break;
    default:
        /* Perform copy with two overlapping 16-byte chunks. */
        assert(len < 32);
        storeu128(dst + len - 16, loadu128(src + len - 16));
        storeu128(dst, loadu128(src));
        break;
    }
}

// Note: p_mask is an output param that initialises a poison mask.
// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
//      m bytes of 0xff at the beginning, followed by n bytes of 0x00,
//      then 0xff for the remaining bytes.
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//        lo         ptr                      hi           hi
//        |----------|-------|----------------|............|
//    -start         0  -start+offset    MIN(avail,32)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
//     only start zone.
//            history
//        ptr        lo                       hi           hi
//        |----------|-------|----------------|............|
//        0          start   start+offset     end(<=32)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
static really_inline
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
        u8 val8[32];
        m256 val256;
    } u;
    u.val256 = zeroes256();

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 32) {
            assert(start_offset - start <= 32);
            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
                               + 32 - start_offset + start);
            return loadu256(ptr);
        }
        assert(start_offset - start <= avail);
        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
                           + 32 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
        for (i = start - need; i < start; i++) {
            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
                           + 32 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }

    // Runt block from the buffer.
    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);

    return u.val256;
}
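/*
 * Illustrative sketch only (not used by the Teddy runtime): a direct
 * instantiation of the p_mask_arr256 convention documented above, with
 * lead_poison (m) leading 0xff bytes and valid (n) 0x00 bytes, where
 * lead_poison + valid <= 32; the remaining lanes come back as 0xff. The
 * end/short zone of vectoredLoad256 builds exactly this mask with
 * lead_poison = start_offset - start and valid = avail - lead_poison.
 * The function name and parameters are hypothetical.
 */
static really_inline
m256 examplePoisonMask256(u32 lead_poison, u32 valid) {
    return loadu256(p_mask_arr256[valid] + 32 - lead_poison);
}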
#endif // HAVE_AVX2

#if defined(HAVE_AVX512)
// Note: p_mask is an output param that initialises a poison mask.
// u64a k = ones_u64a << n' >> m'; // m' < n'
// *p_mask = set_mask_m512(~k);
// means p_mask consists of:
//      (n' - m') poison bytes "0xff" at the beginning,
//      followed by (64 - n') valid bytes "0x00",
//      then the remaining m' poison bytes "0xff".
// ptr >= lo:
//     no history.
//     for end/short zone, ptr==lo and start_offset==0
//     for start zone, see below
//        lo         ptr                      hi           hi
//        |----------|-------|----------------|............|
//    -start         0  -start+offset    MIN(avail,64)
// p_mask ffff..ff0000...........00ffff..........
// ptr < lo:
//     only start zone.
//            history
//        ptr        lo                       hi           hi
//        |----------|-------|----------------|............|
//        0          start   start+offset     end(<=64)
// p_mask ffff.....ffffff..ff0000...........00ffff..........
static really_inline
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
                     const u32 nMasks) {
    m512 val;

    uintptr_t copy_start;
    uintptr_t copy_len;

    if (ptr >= lo) { // short/end/start zone
        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 64) {
            assert(start_offset - start <= 64);
            u64a k = ones_u64a << (start_offset - start);
            *p_mask = set_mask_m512(~k);
            return loadu512(ptr);
        }
        assert(start_offset - start <= avail);
        u64a k = ones_u64a << (64 - avail + start_offset - start)
                            >> (64 - avail);
        *p_mask = set_mask_m512(~k);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr), MIN(hlen, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
        val = loadu_maskz_m512(j, &hbuf[hlen - start]);
        uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
        u64a k = ones_u64a << (64 - end + start + start_offset)
                            >> (64 - end);
        *p_mask = set_mask_m512(~k);
        copy_start = start;
        copy_len = end - start;
    }

    assert(copy_len < 64);
    assert(copy_len > 0);
    u64a j = ones_u64a >> (64 - copy_len) << copy_start;
    val = loadu_mask_m512(val, j, ptr);

    return val;
}
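/*
 * Illustrative sketch only (not used by the Teddy runtime): a direct
 * instantiation of the shift-based poison mask described above, for a
 * window whose valid bytes occupy [pos, pos + valid) of the 64-byte block
 * (valid > 0, pos + valid <= 64). Here n' = 64 - valid and
 * m' = 64 - valid - pos, so ~k expands to pos bytes of 0xff, valid bytes
 * of 0x00, then 0xff to the end. The function name and parameters are
 * hypothetical; vectoredLoad512 above derives the same shifts from start,
 * avail and start_offset.
 */
static really_inline
m512 examplePoisonMask512(u32 pos, u32 valid) {
    u64a k = ones_u64a << (64 - valid) >> (64 - valid - pos);
    return set_mask_m512(~k);
}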
#endif // HAVE_AVX512

static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
                UNUSED CautionReason reason) {
    u64a confVal = 0;
    const u8 *buf = a->buf;
    size_t len = a->len;
    const u8 *confirm_loc = ptr + byte - 7;
#if defined(HAVE_AVX512VBMI)
    if (likely(confirm_loc >= buf)) {
#else
    if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
#endif
        confVal = lv_u64a(confirm_loc, buf, buf + len);
    } else { // r == VECTORING, confirm_loc < buf
        u64a histBytes = a->histBytes;
        confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
        // stitch together confVal and history
        u32 overhang = buf - confirm_loc;
        histBytes >>= 64 - (overhang * 8);
        confVal |= histBytes;
    }
    return confVal;
}

static really_inline
void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
                          const u32 *confBase, CautionReason reason,
                          const struct FDR_Runtime_Args *a, const u8 *ptr,
                          hwlmcb_rv_t *control, u32 *last_match) {
    do {
        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
        u32 byte = bit / bucket + offset;
        u32 idx = bit % bucket;
        u32 cf = confBase[idx];
        if (!cf) {
            continue;
        }
        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                        ((const u8 *)confBase + cf);
        if (!(fdrc->groups & *control)) {
            continue;
        }
        u64a tmp = 0;
        u64a confVal = getConfVal(a, ptr, byte, reason);
        confWithBit(fdrc, a, ptr - a->buf + byte, control, last_match,
                    confVal, &tmp, 0);
    } while (unlikely(*conf));
}

static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) {
    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}

static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
    return (const u64a *)((const u8 *)getMaskBase(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
}

static really_inline
const u32 *getConfBase(const struct Teddy *teddy) {
    return (const u32 *)((const u8 *)teddy + teddy->confOffset);
}

#endif /* TEDDY_RUNTIME_COMMON_H_ */