diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c
index 5f354270..a86c568d 100644
--- a/src/util/simd_utils.c
+++ b/src/util/simd_utils.c
@@ -26,6 +26,10 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+/** \file
+ * \brief Lookup tables to support SIMD operations.
+ */
+
 #include "simd_utils.h"
 
 const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = {
@@ -38,3 +42,19 @@ const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = {
     0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
     0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
 };
+
+#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
+
+/** \brief LUT for the mask1bit functions. */
+const u8 simd_onebit_masks[] ALIGN_CL_DIRECTIVE = {
+    ZEROES_31, 0x01, ZEROES_32,
+    ZEROES_31, 0x02, ZEROES_32,
+    ZEROES_31, 0x04, ZEROES_32,
+    ZEROES_31, 0x08, ZEROES_32,
+    ZEROES_31, 0x10, ZEROES_32,
+    ZEROES_31, 0x20, ZEROES_32,
+    ZEROES_31, 0x40, ZEROES_32,
+    ZEROES_31, 0x80, ZEROES_32,
+};
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index 8cea458e..3544629f 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -245,47 +245,37 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
     return a;
 }
 
+extern const u8 simd_onebit_masks[];
+
+static really_inline
+m128 mask1bit128(unsigned int n) {
+    assert(n < sizeof(m128) * 8);
+    u32 mask_idx = ((n % 8) * 64) + 31;
+    mask_idx -= n / 8;
+    return loadu128(&simd_onebit_masks[mask_idx]);
+}
+
 // switches on bit N in the given vector.
 static really_inline
 void setbit128(m128 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
-    // We should be able to figure out a better way than this.
-    union {
-        m128 simd;
-        u8 bytes[sizeof(m128)];
-    } x;
-    x.simd = *ptr;
-
-    u8 *b = &x.bytes[n / 8];
-    *b |= 1U << (n % 8);
-
-    *ptr = x.simd;
+    *ptr = or128(mask1bit128(n), *ptr);
 }
 
 // switches off bit N in the given vector.
 static really_inline
 void clearbit128(m128 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
-    // We should be able to figure out a better way than this.
-    union {
-        m128 simd;
-        u8 bytes[sizeof(m128)];
-    } x;
-    x.simd = *ptr;
-
-    u8 *b = &x.bytes[n / 8];
-    *b &= ~(1U << (n % 8));
-
-    *ptr = x.simd;
+    *ptr = andnot128(mask1bit128(n), *ptr);
 }
 
 // tests bit N in the given vector.
 static really_inline
 char testbit128(const m128 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
-    // We should be able to figure out a better way than this.
-    const char *bytes = (const char *)ptr;
-    return !!(bytes[n / 8] & (1 << (n % 8)));
+    const m128 mask = mask1bit128(n);
+#if defined(__SSE4_1__)
+    return !_mm_testz_si128(mask, *ptr);
+#else
+    return isnonzero128(and128(mask, *ptr));
+#endif
 }
 
 // offset must be an immediate
@@ -551,6 +541,14 @@ m256 loadbytes256(const void *ptr, unsigned int n) {
     return a;
 }
 
+static really_inline
+m256 mask1bit256(unsigned int n) {
+    assert(n < sizeof(m256) * 8);
+    u32 mask_idx = ((n % 8) * 64) + 31;
+    mask_idx -= n / 8;
+    return loadu256(&simd_onebit_masks[mask_idx]);
+}
+
 #if !defined(__AVX2__)
 // switches on bit N in the given vector.
 static really_inline
@@ -599,42 +597,19 @@ char testbit256(const m256 *ptr, unsigned int n) {
 // switches on bit N in the given vector.
 static really_inline
 void setbit256(m256 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
-    // We should be able to figure out a better way than this.
-    union {
-        m256 simd;
-        u8 bytes[sizeof(m256)];
-    } x;
-    x.simd = *ptr;
-
-    u8 *b = &x.bytes[n / 8];
-    *b |= 1U << (n % 8);
-
-    *ptr = x.simd;
+    *ptr = or256(mask1bit256(n), *ptr);
 }
 
-// TODO: can we do this better in avx-land?
 static really_inline
 void clearbit256(m256 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
-    union {
-        m256 simd;
-        u8 bytes[sizeof(m256)];
-    } x;
-    x.simd = *ptr;
-
-    u8 *b = &x.bytes[n / 8];
-    *b &= ~(1U << (n % 8));
-
-    *ptr = x.simd;
+    *ptr = andnot256(mask1bit256(n), *ptr);
 }
 
 // tests bit N in the given vector.
 static really_inline
 char testbit256(const m256 *ptr, unsigned int n) {
-    assert(n < sizeof(*ptr) * 8);
-    const char *bytes = (const char *)ptr;
-    return !!(bytes[n / 8] & (1 << (n % 8)));
+    const m256 mask = mask1bit256(n);
+    return !_mm256_testz_si256(mask, *ptr);
 }
 
 static really_really_inline