simd_utils: setbit/clearbit by loading 1-bit mask

2026-01-17 16:00:26 +03:00 · 2016-05-13 09:39:26 +10:00
parent 790683b641
commit 49bb3b5c82
2 changed files with 50 additions and 55 deletions
--- a/src/util/simd_utils.c
+++ b/src/util/simd_utils.c
@@ -26,6 +26,10 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */
 /** \file
 * \brief Lookup tables to support SIMD operations.
 */
 #include "simd_utils.h"
 const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = {
@@ -38,3 +42,19 @@ const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = {
    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
 };
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
 /** \brief LUT for the mask1bit functions. */
 const u8 simd_onebit_masks[] ALIGN_CL_DIRECTIVE = {
    ZEROES_31, 0x01, ZEROES_32,
    ZEROES_31, 0x02, ZEROES_32,
    ZEROES_31, 0x04, ZEROES_32,
    ZEROES_31, 0x08, ZEROES_32,
    ZEROES_31, 0x10, ZEROES_32,
    ZEROES_31, 0x20, ZEROES_32,
    ZEROES_31, 0x40, ZEROES_32,
    ZEROES_31, 0x80, ZEROES_32,
 };
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -245,47 +245,37 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
    return a;
 }
 extern const u8 simd_onebit_masks[];
 static really_inline
 m128 mask1bit128(unsigned int n) {
    assert(n < sizeof(m128) * 8);
    u32 mask_idx = ((n % 8) * 64) + 31;
    mask_idx -= n / 8;
    return loadu128(&simd_onebit_masks[mask_idx]);
 }
 // switches on bit N in the given vector.
 static really_inline
 void setbit128(m128 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
+    *ptr = or128(mask1bit128(n), *ptr);
    // We should be able to figure out a better way than this.
    union {
        m128 simd;
        u8 bytes[sizeof(m128)];
    } x;
    x.simd = *ptr;
    u8 *b = &x.bytes[n / 8];
    *b |= 1U << (n % 8);
    *ptr = x.simd;
 }
 // switches off bit N in the given vector.
 static really_inline
 void clearbit128(m128 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
+    *ptr = andnot128(mask1bit128(n), *ptr);
    // We should be able to figure out a better way than this.
    union {
        m128 simd;
        u8 bytes[sizeof(m128)];
    } x;
    x.simd = *ptr;
    u8 *b = &x.bytes[n / 8];
    *b &= ~(1U << (n % 8));
    *ptr = x.simd;
 }
 // tests bit N in the given vector.
 static really_inline
 char testbit128(const m128 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
+    const m128 mask = mask1bit128(n);
-    // We should be able to figure out a better way than this.
+#if defined(__SSE4_1__)
-    const char *bytes = (const char *)ptr;
+    return !_mm_testz_si128(mask, *ptr);
-    return !!(bytes[n / 8] & (1 << (n % 8)));
+#else
    return isnonzero128(and128(mask, *ptr));
 #endif
 }
 // offset must be an immediate
@@ -551,6 +541,14 @@ m256 loadbytes256(const void *ptr, unsigned int n) {
    return a;
 }
 static really_inline
 m256 mask1bit256(unsigned int n) {
    assert(n < sizeof(m256) * 8);
    u32 mask_idx = ((n % 8) * 64) + 31;
    mask_idx -= n / 8;
    return loadu256(&simd_onebit_masks[mask_idx]);
 }
 #if !defined(__AVX2__)
 // switches on bit N in the given vector.
 static really_inline
@@ -599,42 +597,19 @@ char testbit256(const m256 *ptr, unsigned int n) {
 // switches on bit N in the given vector.
 static really_inline
 void setbit256(m256 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
+    *ptr = or256(mask1bit256(n), *ptr);
    // We should be able to figure out a better way than this.
    union {
        m256 simd;
        u8 bytes[sizeof(m256)];
    } x;
    x.simd = *ptr;
    u8 *b = &x.bytes[n / 8];
    *b |= 1U << (n % 8);
    *ptr = x.simd;
 }
 // TODO: can we do this better in avx-land?
 static really_inline
 void clearbit256(m256 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
+    *ptr = andnot256(mask1bit256(n), *ptr);
    union {
        m256 simd;
        u8 bytes[sizeof(m256)];
    } x;
    x.simd = *ptr;
    u8 *b = &x.bytes[n / 8];
    *b &= ~(1U << (n % 8));
    *ptr = x.simd;
 }
 // tests bit N in the given vector.
 static really_inline
 char testbit256(const m256 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
+    const m256 mask = mask1bit256(n);
-    const char *bytes = (const char *)ptr;
+    return !_mm256_testz_si256(mask, *ptr);
    return !!(bytes[n / 8] & (1 << (n % 8)));
 }
 static really_really_inline