/*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief SIMD primitives specifically for Intel SSSE3 platforms.
 */

#ifndef SIMD_UTILS_SSSE3_H_E27DF795C9AA02
#define SIMD_UTILS_SSSE3_H_E27DF795C9AA02

#if !defined(_WIN32) && !defined(__SSSE3__)
#error SSSE3 instructions must be enabled
#endif

#include "simd_utils.h"
#include "ue2common.h"

#include <assert.h> // assert() is used by the switch-based vpalignr fallback

// we may already have x86intrin.h
#if !defined(USE_X86INTRIN_H)
#if defined(HAVE_C_INTRIN_H)
#include <intrin.h>
#elif defined(HAVE_TMMINTRIN_H)
#include <tmmintrin.h> // SSSE3 intrinsics
#else
#define I_HAVE_BROKEN_INTRINSICS
#endif
#endif
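
// palignr(r, l, offset) concatenates r:l into a 32-byte value (l in the low
// 16 bytes) and extracts the 16 bytes starting at byte `offset`: e.g.
// palignr(r, l, 1) yields bytes l[1..15] followed by r[0].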
#if !defined(I_HAVE_BROKEN_INTRINSICS)
// newish compilers get this right
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
#else
// must be inline, even in weak-sauce debug builds.
// oldish compilers either don't have the intrinsic, or force one arg through
// memory
static really_really_inline
m128 palignr(m128 r, m128 l, const int offset) {
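    // "+x": r is both input and output in an xmm register; "i": offset must
    // be a compile-time constant, as palignr encodes it as an immediate.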
__asm__ ("palignr %2,%1,%0" : "+x"(r) : "x"(l), "i"(offset));
return r;
}
#endif
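
// pshufb selects bytes: result[i] = a[b[i] & 0xf], or 0 if the high bit of
// b[i] is set.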
static really_inline
m128 pshufb(m128 a, m128 b) {
    m128 result;
#if !defined(I_HAVE_BROKEN_INTRINSICS)
    result = _mm_shuffle_epi8(a, b);
#else
    __asm__("pshufb\t%1,%0" : "=x"(result) : "xm"(b), "0"(a));
#endif
    return result;
}

#if defined(__AVX2__)
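
// Note that 256-bit VPSHUFB shuffles within each 128-bit lane: the index
// bytes in b select only from the corresponding lane of a.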
static really_inline
m256 vpshufb(m256 a, m256 b) {
return _mm256_shuffle_epi8(a, b);
}
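
// 256-bit VPALIGNR likewise concatenates and shifts within each 128-bit
// lane, matching the two-m128 fallback in the non-AVX2 path below.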
#if defined(USE_GCC_COMPOUND_STATEMENTS)
#define vpalignr(r, l, offset) ({ \
    m256 res = _mm256_alignr_epi8(r, l, offset); \
    res; \
})
#else
static really_inline
m256 vpalignr(m256 r, m256 l, const int offset) {
    return _mm256_alignr_epi8(r, l, offset);
}
#endif

#else // not __AVX2__
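
// Without AVX2, the 256-bit shuffle is synthesized from two 128-bit pshufb
// operations on the low and high halves.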
static really_inline
m256 vpshufb(m256 a, m256 b) {
    m256 rv;
    rv.lo = pshufb(a.lo, b.lo);
    rv.hi = pshufb(a.hi, b.hi);
    return rv;
}

/* palignr requires the offset to be an immediate, which a compound-statement
 * macro preserves; otherwise we have to enumerate the possible offsets and
 * hope the compiler can throw the rest away. */
#if defined(USE_GCC_COMPOUND_STATEMENTS)
#define vpalignr(r, l, offset) ({ \
    m256 res; \
    res.lo = palignr(r.lo, l.lo, offset); \
    res.hi = palignr(r.hi, l.hi, offset); \
    res; \
})
#else
#define VPALIGN_CASE(N) case N: \
    res.lo = palignr(r.lo, l.lo, N); \
    res.hi = palignr(r.hi, l.hi, N); \
    return res;
static really_inline
m256 vpalignr(m256 r, m256 l, const int offset) {
    m256 res;
    switch (offset) {
    VPALIGN_CASE(0)
    VPALIGN_CASE(1)
    VPALIGN_CASE(2)
    VPALIGN_CASE(3)
    VPALIGN_CASE(4)
    VPALIGN_CASE(5)
    VPALIGN_CASE(6)
    VPALIGN_CASE(7)
    VPALIGN_CASE(8)
    VPALIGN_CASE(9)
    VPALIGN_CASE(10)
    VPALIGN_CASE(11)
    VPALIGN_CASE(12)
    VPALIGN_CASE(13)
    VPALIGN_CASE(14)
    VPALIGN_CASE(15)
    default:
        assert(0);
        return zeroes256();
    }
}
#undef VPALIGN_CASE
#endif
#endif // __AVX2__
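
/* Usage sketch (illustrative; `prev` and `cur` are hypothetical consecutive
 * 32-byte loads, not names from this header):
 *
 *   m256 shifted = vpalignr(cur, prev, 15);
 *
 * Per the lane-wise VPALIGNR semantics above, each 128-bit lane of the
 * result is the last byte of prev's lane followed by the first 15 bytes of
 * cur's lane. */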
#endif /* SIMD_UTILS_SSSE3_H_E27DF795C9AA02 */