/*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "shufti_common.h"

#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"

static really_inline
const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars,
                                     const u8 *buf, const m256 low4bits,
                                     const m256 zeroes, const u8 run_len
#ifdef MULTIACCEL_DOUBLE
                                     , const u8 run_len2
#endif
                                     ) {
    // z has a bit set for each byte of 'chars' that is NOT in the class, so
    // the multiaccel match function scans ~z for a run of run_len match bits.
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, ~z
#ifdef MULTIACCEL_DOUBLE
                                                             , run_len2
#endif
                                                             );
}

const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi,
                                       const u8 *buf, const u8 *buf_end,
                                       u8 run_len
#ifdef MULTIACCEL_DOUBLE
                                       , u8 run_len2
#endif
                                       ) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 32) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 zeroes = zeroes256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf,
                                    low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
                                    , run_len2
#endif
                                    );
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, lchars,
                                        buf, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
                                        , run_len2
#endif
                                        );
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
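    // The load below deliberately overlaps the tail of the region scanned by
    // the loop, so the final bytes are still examined in a full 32-byte
    // window.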
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars,
                                    buf_end - 32, low4bits, zeroes, run_len
#ifdef MULTIACCEL_DOUBLE
                                    , run_len2
#endif
                                    );
    if (rv) {
        return rv;
    }
    return buf_end;
}
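
/*
 * Illustrative only: a scalar model of the per-byte test that block() (from
 * shufti_common.h) performs 32 bytes at a time with PSHUFB, assuming the
 * usual shufti encoding in which a byte belongs to the character class iff
 * the AND of its two nibble-table lookups is nonzero. The function name is
 * hypothetical, and the sketch is kept out of the build.
 */
#if 0
static int shufti_byte_in_class(const u8 mask_lo[16], const u8 mask_hi[16],
                                u8 c) {
    // The low nibble indexes mask_lo and the high nibble indexes mask_hi;
    // the vector code does both lookups for 32 bytes at once and derives
    // the movemask word z from the AND of the results. Two 16-entry tables
    // suffice because PSHUFB can only index 16 lanes at a time.
    return (mask_lo[c & 0xf] & mask_hi[c >> 4]) != 0;
}
#endif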