diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c index 5f354270..a86c568d 100644 --- a/src/util/simd_utils.c +++ b/src/util/simd_utils.c @@ -26,6 +26,10 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/** \file + * \brief Lookup tables to support SIMD operations. + */ + #include "simd_utils.h" const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { @@ -38,3 +42,19 @@ const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, }; + +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +const u8 simd_onebit_masks[] ALIGN_CL_DIRECTIVE = { + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, +}; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 8cea458e..3544629f 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -245,47 +245,37 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } +extern const u8 simd_onebit_masks[]; + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 31; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + // switches on bit N in the given vector. static really_inline void setbit128(m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - union { - m128 simd; - u8 bytes[sizeof(m128)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b |= 1U << (n % 8); - - *ptr = x.simd; + *ptr = or128(mask1bit128(n), *ptr); } // switches off bit N in the given vector. static really_inline void clearbit128(m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - union { - m128 simd; - u8 bytes[sizeof(m128)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b &= ~(1U << (n % 8)); - - *ptr = x.simd; + *ptr = andnot128(mask1bit128(n), *ptr); } // tests bit N in the given vector. static really_inline char testbit128(const m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - const char *bytes = (const char *)ptr; - return !!(bytes[n / 8] & (1 << (n % 8))); + const m128 mask = mask1bit128(n); +#if defined(__SSE4_1__) + return !_mm_testz_si128(mask, *ptr); +#else + return isnonzero128(and128(mask, *ptr)); +#endif } // offset must be an immediate @@ -551,6 +541,14 @@ m256 loadbytes256(const void *ptr, unsigned int n) { return a; } +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 31; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + #if !defined(__AVX2__) // switches on bit N in the given vector. static really_inline @@ -599,42 +597,19 @@ char testbit256(const m256 *ptr, unsigned int n) { // switches on bit N in the given vector. static really_inline void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - union { - m256 simd; - u8 bytes[sizeof(m256)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b |= 1U << (n % 8); - - *ptr = x.simd; + *ptr = or256(mask1bit256(n), *ptr); } -// TODO: can we do this better in avx-land? static really_inline void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - union { - m256 simd; - u8 bytes[sizeof(m256)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b &= ~(1U << (n % 8)); - - *ptr = x.simd; + *ptr = andnot256(mask1bit256(n), *ptr); } // tests bit N in the given vector. static really_inline char testbit256(const m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - const char *bytes = (const char *)ptr; - return !!(bytes[n / 8] & (1 << (n % 8))); + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, *ptr); } static really_really_inline