mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
simd: simplify the set-all-ones util funcs
Modern compilers (gcc, icc) get this right, with the benefit of removing our last use of inline asm in this file.
This commit is contained in:
parent
4d6934fc77
commit
adf820bbba
@ -88,10 +88,6 @@
|
|||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
#define NO_ASM
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Fallback to identity case.
|
// Fallback to identity case.
|
||||||
#ifndef assume_aligned
|
#ifndef assume_aligned
|
||||||
#define assume_aligned(x, y) (x)
|
#define assume_aligned(x, y) (x)
|
||||||
@ -106,13 +102,12 @@ extern const char vbs_mask_data[];
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static really_inline m128 ones128(void) {
|
static really_inline m128 ones128(void) {
|
||||||
#if !defined(NO_ASM)
|
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
|
||||||
// trick from Intel's optimization guide to generate all-ones. We have to
|
/* gcc gets this right */
|
||||||
// use inline asm, as there's no intrinsic for this yet.
|
return _mm_set1_epi8(0xFF);
|
||||||
m128 ret;
|
|
||||||
__asm__ ("pcmpeqb %0,%0" : "=x"(ret));
|
|
||||||
return ret;
|
|
||||||
#else
|
#else
|
||||||
|
/* trick from Intel's optimization guide to generate all-ones.
|
||||||
|
* ICC converts this to the single cmpeq instruction */
|
||||||
return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
|
return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -172,19 +167,8 @@ static really_inline unsigned short cmpmsk8(m128 a, m128 b) {
|
|||||||
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
|
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
|
||||||
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
|
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
|
||||||
|
|
||||||
// We found that this generated better code with gcc-4.1 and with the default
|
|
||||||
// tuning settings on gcc-4.4 than just using the _mm_set1_epi8() instrinsic.
|
|
||||||
static really_inline m128 set16x8(u8 c) {
|
static really_inline m128 set16x8(u8 c) {
|
||||||
#if !defined(__AVX2__)
|
|
||||||
m128 a = _mm_cvtsi32_si128((int)c);
|
|
||||||
a = _mm_unpacklo_epi8(a, a);
|
|
||||||
a = _mm_unpacklo_epi8(a, a);
|
|
||||||
a = _mm_shuffle_epi32(a, 0);
|
|
||||||
return a;
|
|
||||||
#else
|
|
||||||
// uses a broadcast for much win
|
|
||||||
return _mm_set1_epi8(c);
|
return _mm_set1_epi8(c);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline u32 movd(const m128 in) {
|
static really_inline u32 movd(const m128 in) {
|
||||||
@ -369,8 +353,7 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
|||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
m256 set32x8(u32 in) {
|
m256 set32x8(u32 in) {
|
||||||
m128 a = _mm_cvtsi32_si128(in);
|
return _mm256_set1_epi8(in);
|
||||||
return _mm256_broadcastb_epi8(a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define eq256(a, b) _mm256_cmpeq_epi8((a), (b))
|
#define eq256(a, b) _mm256_cmpeq_epi8((a), (b))
|
||||||
@ -423,12 +406,7 @@ static really_inline m256 zeroes256(void) {
|
|||||||
|
|
||||||
static really_inline m256 ones256(void) {
|
static really_inline m256 ones256(void) {
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
m256 rv;
|
m256 rv = _mm256_set1_epi8(0xFF);
|
||||||
#if defined(NO_ASM)
|
|
||||||
rv = eq256(zeroes256(), zeroes256());
|
|
||||||
#else
|
|
||||||
__asm__ ("vpcmpeqb %0,%0,%0" : "=x"(rv));
|
|
||||||
#endif
|
|
||||||
#else
|
#else
|
||||||
m256 rv = {ones128(), ones128()};
|
m256 rv = {ones128(), ones128()};
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user