/*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Shufti: character class acceleration.
 *
 * Utilises the SSSE3 pshufb shuffle instruction.
 */

#include "shufti.h"
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/unaligned.h"

#include "shufti_common.h"

#include "util/simd_utils_ssse3.h"

/** \brief Naive byte-by-byte implementation. */
static really_inline
const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);

    for (buf_end--; buf_end >= buf; buf_end--) {
        u8 c = *buf_end;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf_end;
}

#if !defined(__AVX2__)

/* Normal SSSE3 shufti */

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
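    // The unaligned head load above already covered buf..buf+15, and buf has
    // now been rounded up to a 16-byte boundary, so the load128() calls in
    // the main loop below are all aligned. Head bytes examined twice are
    // harmless: we want the first match, and the head block was checked
    // first.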
    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}

static really_inline
const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) {
#ifdef DEBUG
    DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n");
#endif

    u32 z = movemask128(eq128(t, compare));
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    m128 c_lo = pshufb(mask_lo, GET_LO_4(chars));
    m128 c_hi = pshufb(mask_hi, GET_HI_4(chars));
    m128 t = and128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif

    return lastMatch(buf, t, zeroes);
}

const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf.
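    // A match found here must lie below the blocks already rejected by the
    // aligned loop above, and revBlock() reports the highest match in its
    // window, so it is the overall last match in the buffer.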
    chars = loadu128(buf);
    rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1; // no match: report the position before the buffer
}

/* Double shufti: looks for a match of mask1 at byte i followed by a match of
 * mask2 at byte i + 1. The double masks use inverted (complemented) match
 * semantics: a lane that stays all-ones does not match, hence the comparison
 * against ones below. */
static really_inline
const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
                    m128 chars, const u8 *buf, const m128 low4bits,
                    const m128 ones) {
    m128 chars_lo = GET_LO_4(chars);
    m128 chars_hi = GET_HI_4(chars);
    m128 c_lo = pshufb(mask1_lo, chars_lo);
    m128 c_hi = pshufb(mask1_hi, chars_hi);
    m128 t = or128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif

    m128 c2_lo = pshufb(mask2_lo, chars_lo);
    m128 c2_hi = pshufb(mask2_hi, chars_hi);
    m128 t2 = or128(t, shiftRight8Bits(or128(c2_lo, c2_hi)));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk128(t2); printf("\n");
#endif

    u32 z = movemask128(eq128(t2, ones));

    return firstMatch(buf, z);
}

const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    const m128 ones = ones128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                       lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    chars = loadu128(buf_end - 16);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf_end - 16, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#else // AVX2 - 256 wide shuftis

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 32) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 zeroes = zeroes256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
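    // Same overlap strategy as the SSSE3 path: the unaligned head load above
    // covered buf..buf+31, and buf is now 32-byte aligned for the main loop.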
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
                      zeroes);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits,
                  zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}

static really_inline
const u8 *lastMatch(const u8 *buf, m256 t, m256 compare) {
#ifdef DEBUG
    DEBUG_PRINTF("confirming match in:"); dumpMsk256(t); printf("\n");
#endif

    u32 z = movemask256(eq256(t, compare));
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    m256 c_lo = vpshufb(mask_lo, GET_LO_4(chars));
    m256 c_hi = vpshufb(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    return lastMatch(buf, t, zeroes);
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 64) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 zeroes = zeroes256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32,
                  low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits,
                      zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf.
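    // As in the SSSE3 path, a match found here lies below the blocks already
    // rejected by the aligned loop, so it is the overall last match.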
    chars = loadu256(buf);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1; // no match: report the position before the buffer
}

/* Double shufti, 256-bit: same inverted-mask scheme as the SSSE3 version. */
static really_inline
const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
                    m256 chars, const u8 *buf, const m256 low4bits,
                    const m256 ones) {
    DEBUG_PRINTF("buf %p\n", buf);
    m256 chars_lo = GET_LO_4(chars);
    m256 chars_hi = GET_HI_4(chars);
    m256 c_lo = vpshufb(mask1_lo, chars_lo);
    m256 c_hi = vpshufb(mask1_hi, chars_hi);
    m256 t = or256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    m256 c2_lo = vpshufb(mask2_lo, chars_lo);
    m256 c2_hi = vpshufb(mask2_hi, chars_hi);
    m256 t2 = or256(t, shift256Right8Bits(or256(c2_lo, c2_hi)));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk256(t2); printf("\n");
#endif

    u32 z = movemask256(eq256(t2, ones));

    return firstMatch(buf, z);
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    if (buf_end - buf < 32) {
        // not worth it
        return buf;
    }

    const m256 ones = ones256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask1_lo = set2x128(mask1_lo);
    const m256 wide_mask1_hi = set2x128(mask1_hi);
    const m256 wide_mask2_lo = set2x128(mask2_lo);
    const m256 wide_mask2_hi = set2x128(mask2_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                       wide_mask2_hi, lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    chars = loadu256(buf_end - 32);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf_end - 32, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#endif // AVX2
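/*
 * Illustrative sketch of how the shufti masks encode a character class; the
 * masks are built at pattern-compile time, not in this file, and the values
 * below are hypothetical.
 *
 * Each input byte c is split into nibbles: (c & 0xf) indexes mask_lo and
 * (c >> 4) indexes mask_hi. A byte matches if the two lookups share a set
 * "bucket" bit, i.e. (mask_lo[c & 0xf] & mask_hi[c >> 4]) != 0 -- exactly
 * the test in shuftiRevSlow(), which block()/fwdBlock() apply to 16 (or 32)
 * bytes at once via pshufb/vpshufb.
 *
 * For example, to accept 'a' (0x61), set bit 0 in mask_lo[0x1] and
 * mask_hi[0x6]. To also accept 'B' (0x42), use a second bucket: set bit 1 in
 * mask_lo[0x2] and mask_hi[0x4]. Reusing bucket 0 for both characters would
 * falsely accept the nibble cross-products 0x41 ('A') and 0x62 ('b'), which
 * is why a class is spread across up to eight buckets (one per mask bit).
 */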