vectorscan/unit/internal/simd_utils.cpp

/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include "gtest/gtest.h"
#include "util/arch.h"
#include "util/bytecode_ptr.h"
#include "util/simd_utils.h"

#ifdef setbit
#undef setbit
#endif

using namespace std;
using namespace ue2;

namespace {

// Build a Mask whose storage is all zero except for one bit: bit (bit % 8) of
// the byte at offset (bit / 8) from the start of the object.
template<class Mask>
Mask setbit(unsigned int bit) {
    // Overlay the mask with a byte array so a single byte can be poked.
    union {
        Mask simd;
        char bytes[sizeof(Mask)];
    } overlay;

    memset(overlay.bytes, 0, sizeof(overlay.bytes));

    const unsigned int which_byte = bit >> 3;
    const unsigned int bit_in_byte = bit & 7U;
    overlay.bytes[which_byte] = 1U << bit_in_byte;

    return overlay.simd;
}

// Parameterized tests follow!
//
// Irritatingly we have to define a whole bunch of overrides here... because
// templates. One Admiration Unit for anyone able to build a better way of
// doing this.

// Tag type that converts to an all-zeroes vector of whichever SIMD width the
// destination requires, so typed tests can write `TypeParam z = simd_zeroes();`.
// The conversions do not touch any state, hence they are const-qualified and
// also usable on const instances.
struct simd_zeroes {
    operator m128() const { return zeroes128(); }
    operator m256() const { return zeroes256(); }
    operator m384() const { return zeroes384(); }
    operator m512() const { return zeroes512(); }
};

// Tag type that converts to an all-ones vector of whichever SIMD width the
// destination requires, so typed tests can write `TypeParam o = simd_ones();`.
// The conversions do not touch any state, hence they are const-qualified and
// also usable on const instances.
struct simd_ones {
    operator m128() const { return ones128(); }
    operator m256() const { return ones256(); }
    operator m384() const { return ones384(); }
    operator m512() const { return ones512(); }
};

// simd_diff: forwards to the width-specific diffNNN() helper and collapses
// its result to a bool (true when the helper reports a nonzero value).
bool simd_diff(const m128 &a, const m128 &b) {
    return diff128(a, b) != 0;
}
bool simd_diff(const m256 &a, const m256 &b) {
    return diff256(a, b) != 0;
}
bool simd_diff(const m384 &a, const m384 &b) {
    return diff384(a, b) != 0;
}
bool simd_diff(const m512 &a, const m512 &b) {
    return diff512(a, b) != 0;
}
// simd_isnonzero: forwards to the width-specific isnonzeroNNN() helper and
// collapses its result to a bool.
bool simd_isnonzero(const m128 &a) {
    return isnonzero128(a) != 0;
}
bool simd_isnonzero(const m256 &a) {
    return isnonzero256(a) != 0;
}
bool simd_isnonzero(const m384 &a) {
    return isnonzero384(a) != 0;
}
bool simd_isnonzero(const m512 &a) {
    return isnonzero512(a) != 0;
}
// simd_and: one overload per vector width, each forwarding to andNNN().
m128 simd_and(const m128 &a, const m128 &b) {
    return and128(a, b);
}
m256 simd_and(const m256 &a, const m256 &b) {
    return and256(a, b);
}
m384 simd_and(const m384 &a, const m384 &b) {
    return and384(a, b);
}
m512 simd_and(const m512 &a, const m512 &b) {
    return and512(a, b);
}
// simd_or: one overload per vector width, each forwarding to orNNN().
m128 simd_or(const m128 &a, const m128 &b) {
    return or128(a, b);
}
m256 simd_or(const m256 &a, const m256 &b) {
    return or256(a, b);
}
m384 simd_or(const m384 &a, const m384 &b) {
    return or384(a, b);
}
m512 simd_or(const m512 &a, const m512 &b) {
    return or512(a, b);
}
// simd_xor: one overload per vector width, each forwarding to xorNNN().
m128 simd_xor(const m128 &a, const m128 &b) {
    return xor128(a, b);
}
m256 simd_xor(const m256 &a, const m256 &b) {
    return xor256(a, b);
}
m384 simd_xor(const m384 &a, const m384 &b) {
    return xor384(a, b);
}
m512 simd_xor(const m512 &a, const m512 &b) {
    return xor512(a, b);
}
// simd_andnot: one overload per vector width, each forwarding to andnotNNN()
// with the arguments in the same order.
m128 simd_andnot(const m128 &a, const m128 &b) {
    return andnot128(a, b);
}
m256 simd_andnot(const m256 &a, const m256 &b) {
    return andnot256(a, b);
}
m384 simd_andnot(const m384 &a, const m384 &b) {
    return andnot384(a, b);
}
m512 simd_andnot(const m512 &a, const m512 &b) {
    return andnot512(a, b);
}
// simd_not: one overload per vector width, each forwarding to notNNN().
m128 simd_not(const m128 &a) {
    return not128(a);
}
m256 simd_not(const m256 &a) {
    return not256(a);
}
m384 simd_not(const m384 &a) {
    return not384(a);
}
m512 simd_not(const m512 &a) {
    return not512(a);
}
// simd_clearbit: one overload per vector width, each forwarding to
// clearbitNNN() on the vector pointed to by `a`.
void simd_clearbit(m128 *a, unsigned int i) {
    clearbit128(a, i);
}
void simd_clearbit(m256 *a, unsigned int i) {
    clearbit256(a, i);
}
void simd_clearbit(m384 *a, unsigned int i) {
    clearbit384(a, i);
}
void simd_clearbit(m512 *a, unsigned int i) {
    clearbit512(a, i);
}
// simd_setbit: one overload per vector width, each forwarding to setbitNNN()
// on the vector pointed to by `a`.
void simd_setbit(m128 *a, unsigned int i) {
    setbit128(a, i);
}
void simd_setbit(m256 *a, unsigned int i) {
    setbit256(a, i);
}
void simd_setbit(m384 *a, unsigned int i) {
    setbit384(a, i);
}
void simd_setbit(m512 *a, unsigned int i) {
    setbit512(a, i);
}
// simd_testbit: forwards to the width-specific testbitNNN() helper and
// collapses its result to a bool.
bool simd_testbit(const m128 &a, unsigned int i) {
    return testbit128(a, i) != 0;
}
bool simd_testbit(const m256 &a, unsigned int i) {
    return testbit256(a, i) != 0;
}
bool simd_testbit(const m384 &a, unsigned int i) {
    return testbit384(a, i) != 0;
}
bool simd_testbit(const m512 &a, unsigned int i) {
    return testbit512(a, i) != 0;
}
// simd_diffrich: one overload per vector width, each returning the u32
// produced by diffrichNNN().
u32 simd_diffrich(const m128 &a, const m128 &b) {
    return diffrich128(a, b);
}
u32 simd_diffrich(const m256 &a, const m256 &b) {
    return diffrich256(a, b);
}
u32 simd_diffrich(const m384 &a, const m384 &b) {
    return diffrich384(a, b);
}
u32 simd_diffrich(const m512 &a, const m512 &b) {
    return diffrich512(a, b);
}
// simd_diffrich64: one overload per vector width, each returning the u32
// produced by diffrich64_NNN().
u32 simd_diffrich64(const m128 &a, const m128 &b) {
    return diffrich64_128(a, b);
}
u32 simd_diffrich64(const m256 &a, const m256 &b) {
    return diffrich64_256(a, b);
}
u32 simd_diffrich64(const m384 &a, const m384 &b) {
    return diffrich64_384(a, b);
}
u32 simd_diffrich64(const m512 &a, const m512 &b) {
    return diffrich64_512(a, b);
}
// simd_store: one overload per vector width, each forwarding to storeNNN().
void simd_store(void *ptr, const m128 &a) {
    store128(ptr, a);
}
void simd_store(void *ptr, const m256 &a) {
    store256(ptr, a);
}
void simd_store(void *ptr, const m384 &a) {
    store384(ptr, a);
}
void simd_store(void *ptr, const m512 &a) {
    store512(ptr, a);
}
// simd_load: one overload per vector width; writes the result of loadNNN(ptr)
// through the output pointer `a`.
void simd_load(m128 *a, const void *ptr) {
    *a = load128(ptr);
}
void simd_load(m256 *a, const void *ptr) {
    *a = load256(ptr);
}
void simd_load(m384 *a, const void *ptr) {
    *a = load384(ptr);
}
void simd_load(m512 *a, const void *ptr) {
    *a = load512(ptr);
}
// simd_loadu: one overload per vector width; writes the result of
// loaduNNN(ptr) through the output pointer `a`.
void simd_loadu(m128 *a, const void *ptr) {
    *a = loadu128(ptr);
}
void simd_loadu(m256 *a, const void *ptr) {
    *a = loadu256(ptr);
}
void simd_loadu(m384 *a, const void *ptr) {
    *a = loadu384(ptr);
}
void simd_loadu(m512 *a, const void *ptr) {
    *a = loadu512(ptr);
}
// simd_storebytes: one overload per vector width, each forwarding to
// storebytesNNN() with the byte count `i`.
void simd_storebytes(void *ptr, const m128 &a, unsigned i) {
    storebytes128(ptr, a, i);
}
void simd_storebytes(void *ptr, const m256 &a, unsigned i) {
    storebytes256(ptr, a, i);
}
void simd_storebytes(void *ptr, const m384 &a, unsigned i) {
    storebytes384(ptr, a, i);
}
void simd_storebytes(void *ptr, const m512 &a, unsigned i) {
    storebytes512(ptr, a, i);
}
// simd_loadbytes: one overload per vector width; writes the result of
// loadbytesNNN(ptr, i) through the output pointer `a`.
void simd_loadbytes(m128 *a, const void *ptr, unsigned i) {
    *a = loadbytes128(ptr, i);
}
void simd_loadbytes(m256 *a, const void *ptr, unsigned i) {
    *a = loadbytes256(ptr, i);
}
void simd_loadbytes(m384 *a, const void *ptr, unsigned i) {
    *a = loadbytes384(ptr, i);
}
void simd_loadbytes(m512 *a, const void *ptr, unsigned i) {
    *a = loadbytes512(ptr, i);
}
// simd_lshift64: one overload per vector width, each forwarding to
// lshift64_mNNN() with the shift count `i`.
m128 simd_lshift64(const m128 &a, unsigned i) {
    return lshift64_m128(a, i);
}
m256 simd_lshift64(const m256 &a, unsigned i) {
    return lshift64_m256(a, i);
}
m384 simd_lshift64(const m384 &a, unsigned i) {
    return lshift64_m384(a, i);
}
m512 simd_lshift64(const m512 &a, unsigned i) {
    return lshift64_m512(a, i);
}

// Stateless fixture for the typed tests below; it exists only so each
// TYPED_TEST can be instantiated once per SIMD vector type.
template<typename T>
class SimdUtilsTest : public testing::Test {};

// The vector types every typed test is run against.
using SimdTypes = ::testing::Types<m128, m256, m384, m512>;
TYPED_TEST_CASE(SimdUtilsTest, SimdTypes);

//
// The tests themselves.
//

TYPED_TEST(SimdUtilsTest, zero) {
    const TypeParam all_clear = simd_zeroes();

    // Every byte of the vector must match a zero-filled reference buffer.
    char reference[sizeof(TypeParam)];
    memset(reference, 0, sizeof(reference));
    ASSERT_EQ(0, memcmp(reference, &all_clear, sizeof(all_clear)));
}

TYPED_TEST(SimdUtilsTest, ones) {
    const TypeParam all_set = simd_ones();

    // Every byte of the vector must match a 0xff-filled reference buffer.
    char reference[sizeof(TypeParam)];
    memset(reference, 0xff, sizeof(reference));
    ASSERT_EQ(0, memcmp(reference, &all_set, sizeof(all_set)));
}

TYPED_TEST(SimdUtilsTest, and1) {
    const TypeParam none = simd_zeroes();
    const TypeParam full = simd_ones();

    // AND truth table on saturated operands: only ones & ones stays ones.
    const TypeParam zero_and_one = simd_and(none, full);
    EXPECT_FALSE(simd_diff(zero_and_one, none));

    const TypeParam one_and_zero = simd_and(full, none);
    EXPECT_FALSE(simd_diff(one_and_zero, none));

    const TypeParam zero_and_zero = simd_and(none, none);
    EXPECT_FALSE(simd_diff(zero_and_zero, none));

    const TypeParam one_and_one = simd_and(full, full);
    EXPECT_FALSE(simd_diff(one_and_one, full));
}

TYPED_TEST(SimdUtilsTest, and2) {
    // Fill both operands with a repeating byte pattern.
    TypeParam lhs, rhs;
    memset(&lhs, 0x33, sizeof(lhs));
    memset(&rhs, 0x55, sizeof(rhs));

    // Byte-wise view of the result.
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } out;
    out.simd = simd_and(lhs, rhs);

    // Each result byte must be the AND of the two pattern bytes.
    const char want = 0x33 & 0x55;
    for (const char got : out.bytes) {
        EXPECT_EQ(want, got);
    }
}

TEST(SimdUtils, diff256) {
    const unsigned num_bits = 256;

    // Identical operands must report no difference.
    ASSERT_EQ(0U, diff256(zeroes256(), zeroes256()));
    ASSERT_EQ(0U, diff256(ones256(), ones256()));
    for (unsigned bit = 0; bit < num_bits; bit++) {
        const m256 lhs = setbit<m256>(bit);
        const m256 rhs = setbit<m256>(bit);
        ASSERT_EQ(0U, diff256(lhs, rhs));
    }

    // A vector with any single bit set must differ from all-zeroes, and
    // diff256() must report that as exactly 1.
    for (unsigned bit = 0; bit < num_bits; bit++) {
        const m256 one_bit = setbit<m256>(bit);
        const u32 rv = diff256(zeroes256(), one_bit);
        ASSERT_EQ(1U, rv);
    }
}

TYPED_TEST(SimdUtilsTest, or1) {
    const TypeParam none = simd_zeroes();
    const TypeParam full = simd_ones();

    // OR truth table on saturated operands: only zeroes | zeroes stays zero.
    const TypeParam zero_or_one = simd_or(none, full);
    EXPECT_FALSE(simd_diff(zero_or_one, full));

    const TypeParam one_or_zero = simd_or(full, none);
    EXPECT_FALSE(simd_diff(one_or_zero, full));

    const TypeParam zero_or_zero = simd_or(none, none);
    EXPECT_FALSE(simd_diff(zero_or_zero, none));

    const TypeParam one_or_one = simd_or(full, full);
    EXPECT_FALSE(simd_diff(one_or_one, full));
}

TYPED_TEST(SimdUtilsTest, or2) {
    // Fill both operands with a repeating byte pattern.
    TypeParam lhs, rhs;
    memset(&lhs, 0x33, sizeof(lhs));
    memset(&rhs, 0x55, sizeof(rhs));

    // NOTE(review): this loop exercises diffrich256() and does not involve
    // TypeParam or OR at all; it runs once per instantiation of this test.
    // For each single-bit m256, diffrich256() against zero must flag exactly
    // the 32-bit word containing that bit.
    for (unsigned bit = 0; bit < 8 * 32; bit++) {
        const unsigned word = bit / 32;
        const m256 one_bit = setbit<m256>(bit);
        const m256 nothing = zeroes256();
        ASSERT_EQ(1U << word, diffrich256(one_bit, nothing))
            << "bit " << bit << " not happy";
    }

    // Byte-wise view of the OR result.
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } out;
    out.simd = simd_or(lhs, rhs);

    // Each result byte must be the OR of the two pattern bytes.
    const char want = 0x33 | 0x55;
    for (const char got : out.bytes) {
        EXPECT_EQ(want, got);
    }
}

TYPED_TEST(SimdUtilsTest, xor1) {
    const TypeParam none = simd_zeroes();
    const TypeParam full = simd_ones();

    // XOR truth table on saturated operands: equal inputs cancel to zero,
    // unequal inputs give all-ones.
    const TypeParam zero_xor_one = simd_xor(none, full);
    EXPECT_FALSE(simd_diff(zero_xor_one, full));

    const TypeParam one_xor_zero = simd_xor(full, none);
    EXPECT_FALSE(simd_diff(one_xor_zero, full));

    const TypeParam zero_xor_zero = simd_xor(none, none);
    EXPECT_FALSE(simd_diff(zero_xor_zero, none));

    const TypeParam one_xor_one = simd_xor(full, full);
    EXPECT_FALSE(simd_diff(one_xor_one, none));
}

TYPED_TEST(SimdUtilsTest, xor2) {
    // Fill both operands with a repeating byte pattern.
    TypeParam lhs, rhs;
    memset(&lhs, 0x33, sizeof(lhs));
    memset(&rhs, 0x55, sizeof(rhs));

    // Byte-wise view of the result.
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } out;
    out.simd = simd_xor(lhs, rhs);

    // Each result byte must be the XOR of the two pattern bytes.
    const char want = 0x33 ^ 0x55;
    for (const char got : out.bytes) {
        EXPECT_EQ(want, got);
    }
}

TYPED_TEST(SimdUtilsTest, andnot1) {
    const TypeParam none = simd_zeroes();
    const TypeParam full = simd_ones();

    // andnot(a, b) is expected to behave as (~a & b): the only saturated
    // combination that yields all-ones is a = zeroes, b = ones.
    const TypeParam not0_and_1 = simd_andnot(none, full);
    EXPECT_FALSE(simd_diff(not0_and_1, full));

    const TypeParam not1_and_0 = simd_andnot(full, none);
    EXPECT_FALSE(simd_diff(not1_and_0, none));

    const TypeParam not0_and_0 = simd_andnot(none, none);
    EXPECT_FALSE(simd_diff(not0_and_0, none));

    const TypeParam not1_and_1 = simd_andnot(full, full);
    EXPECT_FALSE(simd_diff(not1_and_1, none));
}

TYPED_TEST(SimdUtilsTest, andnot2) {
    // Fill both operands with a repeating byte pattern.
    TypeParam lhs, rhs;
    memset(&lhs, 0x33, sizeof(lhs));
    memset(&rhs, 0x55, sizeof(rhs));

    // Byte-wise view of the result.
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } out;
    out.simd = simd_andnot(lhs, rhs);

    // Each result byte must be (~first & second) of the pattern bytes.
    const char want = ~0x33 & 0x55;
    for (const char got : out.bytes) {
        EXPECT_EQ(want, got);
    }
}

TYPED_TEST(SimdUtilsTest, not1) {
    const TypeParam none = simd_zeroes();
    const TypeParam full = simd_ones();

    // Complementing all-zeroes gives all-ones, and vice versa.
    const TypeParam flipped_zeroes = simd_not(none);
    EXPECT_FALSE(simd_diff(flipped_zeroes, full));

    const TypeParam flipped_ones = simd_not(full);
    EXPECT_FALSE(simd_diff(flipped_ones, none));
}

TYPED_TEST(SimdUtilsTest, not2) {
    // Fill the operand with a repeating byte pattern.
    TypeParam src;
    memset(&src, 0x33, sizeof(src));

    // Byte-wise view of the result.
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } out;
    out.simd = simd_not(src);

    // Each result byte must be the complement of the pattern byte.
    const char want = ~0x33;
    for (const char got : out.bytes) {
        EXPECT_EQ(want, got);
    }
}

TYPED_TEST(SimdUtilsTest, isnonzero) {
    // The two saturated cases first.
    TypeParam probe = simd_zeroes();
    EXPECT_FALSE(simd_isnonzero(probe));

    probe = simd_ones();
    EXPECT_TRUE(simd_isnonzero(probe));

    // Byte-wise view used to build vectors with exactly one bit set.
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } one_hot;

    // Every possible single-bit vector must be reported as nonzero.
    for (size_t byte = 0; byte < sizeof(TypeParam); byte++) {
        for (size_t bit = 0; bit < 8; bit++) {
            memset(&one_hot.simd, 0, sizeof(one_hot.simd));
            one_hot.bytes[byte] = 1 << bit;
            EXPECT_TRUE(simd_isnonzero(one_hot.simd));
        }
    }
}

TYPED_TEST(SimdUtilsTest, clearbit) {
    const unsigned int num_bits = sizeof(TypeParam) * 8;
    const TypeParam full = simd_ones();

    for (unsigned int bit = 0; bit < num_bits; bit++) {
        // Knock one bit out of an all-ones vector; it must now differ.
        TypeParam victim = simd_ones();
        simd_clearbit(&victim, bit);
        ASSERT_NE(0, simd_diff(victim, full)) << "bit " << bit << " wasn't cleared";

        // OR-ing just that bit back in must restore all-ones, which proves
        // that no other bit was touched.
        const TypeParam only_bit = setbit<TypeParam>(bit);
        ASSERT_EQ(0, simd_diff(full, simd_or(victim, only_bit)))
            << "clearing bit " << bit << " caused collateral damage";
    }
}

TYPED_TEST(SimdUtilsTest, testbit) {
    const unsigned int num_bits = sizeof(TypeParam) * 8;
    const TypeParam full = simd_ones();

    // In an all-ones vector every bit position must test as set.
    for (unsigned int bit = 0; bit < num_bits; bit++) {
        ASSERT_EQ(1, simd_testbit(full, bit)) << "bit " << bit << " is on";
    }

    // In a single-bit vector only the chosen position may test as set.
    for (unsigned int i = 0; i < num_bits; i++) {
        const TypeParam one_bit = setbit<TypeParam>(i);
        for (unsigned int j = 0; j < num_bits; j++) {
            const int want = (i == j) ? 1 : 0;
            ASSERT_EQ(want, simd_testbit(one_bit, j)) << "bit " << i
                                                      << " is wrong";
        }
    }
}

TYPED_TEST(SimdUtilsTest, setbit) {
    const unsigned int num_bits = sizeof(TypeParam) * 8;

    // Setting one bit in a zero vector must match the reference mask built
    // by the local setbit<>() helper.
    for (unsigned int bit = 0; bit < num_bits; bit++) {
        const TypeParam reference = setbit<TypeParam>(bit);
        TypeParam built = simd_zeroes();
        simd_setbit(&built, bit);
        ASSERT_FALSE(simd_diff(reference, built));
    }

    // Setting every bit in turn must accumulate to all-ones.
    TypeParam accum = simd_zeroes();
    for (unsigned int bit = 0; bit < num_bits; bit++) {
        simd_setbit(&accum, bit);
    }
    ASSERT_FALSE(simd_diff(simd_ones(), accum));
}

TYPED_TEST(SimdUtilsTest, diffrich) {
    const unsigned num_bits = sizeof(TypeParam) * 8;

    const TypeParam none = simd_zeroes();
    const TypeParam full = simd_ones();

    // Identical operands: no word may be flagged.
    EXPECT_EQ(0U, simd_diffrich(none, none));
    EXPECT_EQ(0U, simd_diffrich(full, full));
    for (unsigned bit = 0; bit < num_bits; bit++) {
        const TypeParam lhs = setbit<TypeParam>(bit);
        const TypeParam rhs = setbit<TypeParam>(bit);
        EXPECT_EQ(0U, simd_diffrich(lhs, rhs));
    }

    // NOTE(review): this check is about simd_testbit() rather than diffrich;
    // it verifies that no bit tests as set in the all-zeroes vector.
    for (unsigned int bit = 0; bit < num_bits; bit++) {
        ASSERT_EQ(0, simd_testbit(none, bit)) << "bit " << bit << " is off";
    }

    // All-zeroes against all-ones: one flag bit per 32-bit word, all set.
    const u32 every_word = (1U << (num_bits / 32)) - 1;
    EXPECT_EQ(every_word, simd_diffrich(none, full));

    // A single differing bit must flag only the 32-bit word that holds it.
    for (unsigned bit = 0; bit < num_bits; bit++) {
        const TypeParam one_bit = setbit<TypeParam>(bit);
        const u32 rv = simd_diffrich(none, one_bit);
        EXPECT_EQ(1U << bit / 32, rv);
    }
}

TYPED_TEST(SimdUtilsTest, diffrich64) {
    const unsigned num_bits = sizeof(TypeParam) * 8;

    const TypeParam none = simd_zeroes();
    const TypeParam full = simd_ones();

    // Identical operands: nothing may be flagged.
    EXPECT_EQ(0U, simd_diffrich64(none, none));
    EXPECT_EQ(0U, simd_diffrich64(full, full));
    for (unsigned bit = 0; bit < num_bits; bit++) {
        const TypeParam lhs = setbit<TypeParam>(bit);
        const TypeParam rhs = setbit<TypeParam>(bit);
        EXPECT_EQ(0U, simd_diffrich64(lhs, rhs));
    }

    // All-zeroes against all-ones differ in every 64-bit word; the expected
    // result has every second bit set (one flag per 64-bit word, at the even
    // bit positions).
    const u32 every_qword = ((1U << (num_bits / 32)) - 1) & 0x55555555u;
    EXPECT_EQ(every_qword, simd_diffrich64(none, full));

    // A single differing bit must flag only the 64-bit word that holds it.
    for (unsigned bit = 0; bit < num_bits; bit++) {
        const TypeParam one_bit = setbit<TypeParam>(bit);
        const u32 rv = simd_diffrich64(none, one_bit);
        EXPECT_EQ(1U << ((bit / 64) * 2), rv);
    }
}

// Unaligned load: read an all-ones vector from every byte offset in
// [1, sizeof(TypeParam)) of a heap buffer.
TYPED_TEST(SimdUtilsTest, loadu) {
    const TypeParam all_set = simd_ones();

    // Twice the vector size leaves room for the vector at any tested offset.
    const size_t buf_len = sizeof(TypeParam) * 2;
    auto storage = std::make_unique<char[]>(buf_len);
    char *const buf = storage.get();

    for (size_t off = 1; off < sizeof(TypeParam); off++) {
        // Zero everything, then plant 0xff bytes exactly where the load
        // will read from.
        memset(buf, 0, buf_len);
        memset(buf + off, 0xff, sizeof(TypeParam));

        TypeParam loaded;
        simd_loadu(&loaded, buf + off);
        ASSERT_EQ(0, simd_diff(loaded, all_set));
    }
}

// Aligned load and store: round-trip a vector with distinct byte values
// through a buffer allocated with the vector type's own alignment.
TYPED_TEST(SimdUtilsTest, load_store) {
    // Source vector whose bytes are 0, 1, 2, ... so any misplaced byte shows.
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } a;
    for (size_t i = 0; i < sizeof(a); i++) {
        a.bytes[i] = (char)(i % 256);
    }

    auto mem_ptr = make_bytecode_ptr<char>(sizeof(a), alignof(TypeParam));
    char *mem = mem_ptr.get();

    // The buffer was requested with alignof(TypeParam), so verify exactly
    // that alignment rather than a fixed 16 bytes (which would under-check
    // the wider vector types).
    ASSERT_EQ(0U, reinterpret_cast<size_t>(mem) % alignof(TypeParam));

    memset(mem, 0, sizeof(a));

    // Store must write the vector's bytes verbatim ...
    simd_store(mem, a.simd);
    ASSERT_EQ(0, memcmp(mem, a.bytes, sizeof(a)));

    // ... and load must read the same vector back.
    TypeParam b;
    simd_load(&b, mem);
    ASSERT_FALSE(simd_diff(a.simd, b));
}

// Packed load and store: write only the first `len` bytes of a vector, read
// them back, and check what ends up in the reloaded vector.
TYPED_TEST(SimdUtilsTest, loadbytes_storebytes) {
    // Source vector whose bytes are 0, 1, 2, ...
    union {
        TypeParam simd;
        char bytes[sizeof(TypeParam)];
    } src;
    for (size_t pos = 0; pos < sizeof(src); pos++) {
        src.bytes[pos] = (char)(pos % 256);
    }

    char mem[sizeof(TypeParam)];
    for (size_t len = 1; len < sizeof(TypeParam); len++) {
        // Pre-fill with 0xff so bytes beyond `len` are distinguishable.
        memset(mem, 0xff, sizeof(TypeParam));

        simd_storebytes(mem, src.simd, len);

        union {
            TypeParam simd;
            char bytes[sizeof(TypeParam)];
        } back;
        simd_loadbytes(&back.simd, mem, len);

        // The first `len` bytes must match the source and the remaining
        // bytes must be zero. (The original note says this takes endianness
        // into account.)
        for (size_t pos = 0; pos < sizeof(TypeParam); pos++) {
            ASSERT_EQ(pos < len ? src.bytes[pos] : 0, back.bytes[pos]);
        }
    }
}

TYPED_TEST(SimdUtilsTest, lshift64) {
    // Every 64-bit lane of the input holds this value.
    static constexpr u64a exp_val = 0x5a5a5a5a5a5a5a5aULL;

    TypeParam src;
    memset(&src, 0x5a, sizeof(src));

    // View of the result as 64-bit lanes.
    union {
        TypeParam simd;
        u64a qword[sizeof(TypeParam) / 8];
    } out;
    const size_t num_lanes = sizeof(out) / 8;

    // Shift counts supplied through a loop variable: each lane must equal
    // the scalar shift of exp_val.
    for (unsigned shift = 0; shift < 64; shift++) {
        out.simd = simd_lshift64(src, shift);

        const u64a want = exp_val << shift;
        for (size_t lane = 0; lane < num_lanes; lane++) {
            EXPECT_EQ(want, out.qword[lane]);
        }
    }

    /* Clang 3.4 on FreeBSD 10 crashes on the following - disable for now */
#if !(defined(__FreeBSD__) && defined(__clang__) && __clang_major__ == 3)

    // Same check again with the shift counts written as literals.
    u64a want;

    out.simd = simd_lshift64(src, 1);
    want = exp_val << 1;
    for (size_t lane = 0; lane < num_lanes; lane++) {
        EXPECT_EQ(want, out.qword[lane]);
    }

    out.simd = simd_lshift64(src, 2);
    want = exp_val << 2;
    for (size_t lane = 0; lane < num_lanes; lane++) {
        EXPECT_EQ(want, out.qword[lane]);
    }

    out.simd = simd_lshift64(src, 7);
    want = exp_val << 7;
    for (size_t lane = 0; lane < num_lanes; lane++) {
        EXPECT_EQ(want, out.qword[lane]);
    }

    out.simd = simd_lshift64(src, 31);
    want = exp_val << 31;
    for (size_t lane = 0; lane < num_lanes; lane++) {
        EXPECT_EQ(want, out.qword[lane]);
    }
#endif
}

// Pin the alignment of each vector type; note that m384 is expected to be
// only 16-byte aligned.
TEST(SimdUtilsTest, alignment) {
    ASSERT_EQ(16U, alignof(m128));
    ASSERT_EQ(32U, alignof(m256));
    ASSERT_EQ(16U, alignof(m384));
    ASSERT_EQ(64U, alignof(m512));
}

// movq() extracts a single u64a from an m128; the last case below shows it is
// expected to be the 0x0123456789abcdef half rather than the all-ones half.
TEST(SimdUtilsTest, movq) {
    m128 simd;

    // All-ones vector: the extracted value must be all ones too.
    simd = ones128();
    u64a r = movq(simd);
    ASSERT_EQ(static_cast<u64a>(~0ULL), r);

    // Every byte 0x80: both the vector and the extracted value must consist
    // of 0x80 bytes.
    char cmp[sizeof(m128)];
    memset(cmp, 0x80, sizeof(m128));
    simd = set1_16x8(0x80);
    r = movq(simd);
    ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
    ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r)));

    // Two distinct 64-bit halves, built with the native intrinsic of each
    // architecture. NOTE(review): if none of these branches is compiled in,
    // simd still holds the 0x80 pattern and the final assertion fails.
#if defined(HAVE_SIMD_128_BITS)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
    simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
    int64x2_t a = { 0x123456789abcdefLL, ~0LL };
    simd = vreinterpretq_s32_s64(a);
#elif defined(ARCH_PPC64EL)
    int64x2_t a = {0x123456789abcdefLL, ~0LL };
    simd = (m128) a;
#endif
#endif
    r = movq(simd);
    // Expected value first (as elsewhere in this file) and typed as u64a so
    // the comparison is unsigned on both sides.
    const u64a expected = 0x123456789abcdefULL;
    ASSERT_EQ(expected, r);
}


// set1_16x8(v) must produce a vector whose 16 bytes all equal v, for every
// possible byte value.
TEST(SimdUtilsTest, set1_16x8) {
    char reference[sizeof(m128)];

    for (unsigned value = 0; value < 256; value++) {
        const m128 splat = set1_16x8(value);
        memset(reference, value, sizeof(splat));
        ASSERT_EQ(0, memcmp(reference, &splat, sizeof(splat)));
    }
}

// set1_4x32(v) must produce a vector whose four 32-bit lanes all equal v.
TEST(SimdUtilsTest, set1_4x32) {
    u32 reference[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 };
    const m128 splat = set1_4x32(reference[0]);
    ASSERT_EQ(0, memcmp(reference, &splat, sizeof(splat)));
}

#if defined(HAVE_SIMD_256_BITS)
// set1_32x8(v) must produce a vector whose 32 bytes all equal v, for every
// possible byte value.
TEST(SimdUtilsTest, set32x8) {
    char reference[sizeof(m256)];

    for (unsigned value = 0; value < 256; value++) {
        const m256 splat = set1_32x8(value);
        memset(reference, value, sizeof(splat));
        ASSERT_EQ(0, memcmp(reference, &splat, sizeof(splat)));
    }
}

// set1_2x128(x) of a byte-splatted m128 must be byte-identical to the
// matching byte-splatted m256, for every possible byte value.
TEST(SimdUtilsTest, set2x128) {
    char reference[sizeof(m256)];

    for (unsigned value = 0; value < 256; value++) {
        const m128 narrow = set1_16x8(value);
        const m256 wide = set1_32x8(value);
        const m256 doubled = set1_2x128(narrow);

        memset(reference, value, sizeof(doubled));
        ASSERT_EQ(0, memcmp(reference, &doubled, sizeof(doubled)));
        ASSERT_EQ(0, memcmp(&wide, &doubled, sizeof(doubled)));
    }
}
#endif

// Cross-checks variable_byte_shift_m128() against rshiftbyte_m128() and
// lshiftbyte_m128() on a vector of 16 distinct bytes: a negative amount must
// match a right byte shift, a positive amount a left byte shift, and +/-16
// must give an all-zero vector.
//
// NOTE(review): every shift count is written as a literal. Presumably the
// *shiftbyte_m128 helpers treat compile-time-constant counts differently from
// run-time ones -- confirm before folding these calls into a loop.
TEST(SimdUtilsTest, variableByteShift128) {
    // 16 distinct characters (loadu128 reads 16 bytes; the NUL terminator is
    // the 17th byte of the array).
    char base[] = "0123456789ABCDEF";
    m128 in = loadu128(base);

    // Right shifts by 0..10 bytes == variable shift by 0..-10.
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
                         variable_byte_shift_m128(in, 0)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
                         variable_byte_shift_m128(in, -1)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2),
                         variable_byte_shift_m128(in, -2)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3),
                         variable_byte_shift_m128(in, -3)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4),
                         variable_byte_shift_m128(in, -4)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5),
                         variable_byte_shift_m128(in, -5)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6),
                         variable_byte_shift_m128(in, -6)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7),
                         variable_byte_shift_m128(in, -7)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8),
                         variable_byte_shift_m128(in, -8)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9),
                         variable_byte_shift_m128(in, -9)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10),
                         variable_byte_shift_m128(in, -10)));

    // Left shifts by 0..10 bytes == variable shift by 0..10.
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0),
                         variable_byte_shift_m128(in, 0)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1),
                         variable_byte_shift_m128(in, 1)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2),
                         variable_byte_shift_m128(in, 2)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3),
                         variable_byte_shift_m128(in, 3)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4),
                         variable_byte_shift_m128(in, 4)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5),
                         variable_byte_shift_m128(in, 5)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6),
                         variable_byte_shift_m128(in, 6)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7),
                         variable_byte_shift_m128(in, 7)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8),
                         variable_byte_shift_m128(in, 8)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9),
                         variable_byte_shift_m128(in, 9)));
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
                         variable_byte_shift_m128(in, 10)));

    // Shifting by the full vector width in either direction leaves nothing.
    EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16)));
    EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16)));
}

// max_u8_m128() must pick the larger byte in every lane, comparing as
// unsigned (0xff must beat 'E' and 0xfe in the last two lanes).
TEST(SimdUtilsTest, max_u8_m128) {
    char base1[] = "0123456789ABCDE\xfe";
    char base2[] = "!!23455889aBCd\xff\xff";
    char expec[] = "0123456889aBCd\xff\xff";

    const m128 lhs = loadu128(base1);
    const m128 rhs = loadu128(base2);
    const m128 want = loadu128(expec);

    const m128 got = max_u8_m128(lhs, rhs);
    EXPECT_TRUE(!diff128(got, want));
}

// min_u8_m128() must pick the smaller byte in every lane, comparing as
// unsigned ('E' and 0xfe must beat 0xff in the last two lanes).
TEST(SimdUtilsTest, min_u8_m128) {
    char base1[] = "0123456789ABCDE\xfe";
    char base2[] = "!!23455889aBCd\xff\xff";
    char expec[] = "!!23455789ABCDE\xfe";

    const m128 lhs = loadu128(base1);
    const m128 rhs = loadu128(base2);
    const m128 want = loadu128(expec);

    const m128 got = min_u8_m128(lhs, rhs);
    EXPECT_TRUE(!diff128(got, want));
}

// sadd_u8_m128() must add byte lanes with unsigned saturation: 0x80 + 0x80
// and 0xff + 'b' both clamp to 0xff, other lanes are plain sums.
TEST(SimdUtilsTest, sadd_u8_m128) {
    unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4',
                             '1', '2', '3', '4', '1', '2', '3', '4'};
    unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10,
                             0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0};
    unsigned char expec[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D',
                             'a', 'b', 'c', 'd', '1', '2', '3', '4'};

    const m128 lhs = loadu128(base1);
    const m128 rhs = loadu128(base2);
    const m128 want = loadu128(expec);

    const m128 got = sadd_u8_m128(lhs, rhs);
    EXPECT_TRUE(!diff128(got, want));
}

// sub_u8_m128() must subtract the second operand from the first in every
// byte lane; none of the lanes used here underflows.
TEST(SimdUtilsTest, sub_u8_m128) {
    unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D',
                             'a', 'b', 'c', 'd', '1', '2', '3', '4'};
    unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4',
                             '1', '2', '3', '4', '1', '2', '3', '4'};
    unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10,
                             0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0};

    const m128 lhs = loadu128(base1);
    const m128 rhs = loadu128(base2);
    const m128 want = loadu128(expec);

    const m128 got = sub_u8_m128(lhs, rhs);
    EXPECT_TRUE(!diff128(got, want));
}

// load_m128_from_u64a(&v) must produce the same vector as set2x64(0, v).
// The value comes from rand() seeded with the current time, so it changes
// from run to run.
TEST(SimdUtilsTest, load_m128_from_u64a) {
    srand(time(nullptr));
    u64a value = rand();

    const m128 loaded = load_m128_from_u64a(&value);
    const m128 expected = set2x64(0LL, value);
    // Uncomment to dump both vectors when debugging:
    //print_m128_16x8("res",loaded);
    //print_m128_16x8("cmp",expected);
    EXPECT_TRUE(!diff128(loaded, expected));
}


// Round-trip a random bit pattern through movemask128(): expand the pattern
// to 0xff/0x00 bytes, take the mask, expand the mask the same way, and
// compare the two byte arrays.
TEST(SimdUtilsTest, movemask_128) {
    srand(time(nullptr));

    u8 input[16] = {0};
    u8 rebuilt[16] = {0};

    // Random pattern in [1, 100]: bit n set means byte n is 0xff.
    const u16 pattern = rand() % 100 + 1;
    for (int lane = 0; lane < 16; lane++) {
        if ((pattern >> lane) & 1) {
            input[lane] = 0xff;
        }
    }

    const m128 v = loadu128(input);
    const u16 mask = movemask128(v);

    // Expand the returned mask back into bytes.
    for (int lane = 0; lane < 16; lane++) {
        if ((mask >> lane) & 1) {
            rebuilt[lane] = 0xff;
        }
    }

    for (int lane = 0; lane < 16; lane++) {
        ASSERT_EQ(input[lane], rebuilt[lane]);
    }
}

// Checks pshufb_m128() against a scalar model: a control byte with bit 0x80
// set must yield 0, any other control byte c must yield table[c % 16].
// Inputs come from rand() seeded with the current time.
TEST(SimdUtilsTest, pshufb_m128) {
    srand(time(nullptr));

    // Random table bytes.
    u8 table[16];
    for (int lane = 0; lane < 16; lane++) {
        table[lane] = rand() % 1000 + 1;
    }

    // Control bytes that never have bit 0x80 set (at most 15 + 99).
    u8 plain_ctl[16];
    for (int lane = 0; lane < 16; lane++) {
        plain_ctl[lane] = lane + (rand() % 100 + 0);
    }

    // On Intel, if bit 0x80 of a control byte is set the result is zero;
    // otherwise the selected lane is (control & 0xf).
    // On NEON or PPC, if the control byte is >= 16 the result is zero;
    // otherwise it selects that lane.
    // Hence the 0x80 case is checked explicitly below.

    // Ensure the special control vector has at least one 0x80 element.
    u8 special_ctl[16] = {0};
    special_ctl[15] = 0x80;

    // Roughly one lane in sixteen becomes 0x80; the rest reuse plain_ctl.
    for (int lane = 0; lane < 15; lane++) {
        const int roll = rand() % 1000 + 0;
        special_ctl[lane] = (roll % 16 == 0) ? 0x80 : plain_ctl[lane];
    }
    /*
        printf("vec3: ");
        for(int i=15; i>=0; i--) { printf("%02x, ", special_ctl[i]); }
        printf("\n");
    */

    // Special case: control vector containing 0x80 bytes.
    m128 v1 = loadu128(table);
    m128 v2 = loadu128(special_ctl);
    m128 vres = pshufb_m128(v1, v2);

    u8 res[16];
    storeu128(res, vres);

    for (int lane = 0; lane < 16; lane++) {
        if (special_ctl[lane] & 0x80) {
            ASSERT_EQ(res[lane], 0);
        } else {
            ASSERT_EQ(table[special_ctl[lane] % 16], res[lane]);
        }
    }

    // Other cases: control vector without any 0x80 bytes.
    v1 = loadu128(table);
    v2 = loadu128(plain_ctl);
    vres = pshufb_m128(v1, v2);
    storeu128(res, vres);

    for (int lane = 0; lane < 16; lane++) {
        if (plain_ctl[lane] & 0x80) {
            ASSERT_EQ(res[lane], 0);
        } else {
            ASSERT_EQ(table[plain_ctl[lane] % 16], res[lane]);
        }
    }
}

/*
 * TEST_ALIGNR128(v1, v2, buf, l): asserts that palignr(v2, v1, l) equals the
 * 16 bytes of `buf` starting at offset `l`.
 *
 *  v1, v2  the two m128 halves handed to palignr (v1 is passed second)
 *  buf     byte array the result is compared against; must hold at least
 *          l + 16 elements
 *  l       byte offset
 *
 * The result is stored into a `u8 res[16]` array that must exist in the
 * calling scope. The loop index has a macro-specific name so it cannot
 * capture a caller's variable used in `l` or `buf`.
 */
#define TEST_ALIGNR128(v1, v2, buf, l) {                                                 \
                                           m128 v_aligned = palignr(v2,v1, l);           \
                                           storeu128(res, v_aligned);                    \
                                           for (size_t alignr_i_=0; alignr_i_<16;        \
                                                alignr_i_++) {                           \
                                               ASSERT_EQ(res[alignr_i_],                 \
                                                         (buf)[alignr_i_ + (l)]);        \
                                           }                                             \
                                       }

// For every byte offset 0..15, TEST_ALIGNR128 checks that palignr() over the
// two halves of a 32-byte counting sequence yields the 16 bytes starting at
// that offset.
TEST(SimdUtilsTest, Alignr128) {
    // TEST_ALIGNR128 stores into the local `res`, so that name must stay.
    u8 vec[32];
    u8 res[16];

    // vec = 0, 1, 2, ..., 31
    for (int idx = 0; idx < 32; idx++) {
        vec[idx] = idx;
    }

    const m128 v1 = loadu128(vec);
    const m128 v2 = loadu128(vec + 16);

    for (int j = 0; j < 16; j++) {
        TEST_ALIGNR128(v1, v2, vec, j);
    }
}


} // namespace