From 49eb18ee4f21b5bd389e0e9d5644be1ec1dc85c6 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 22:50:05 +0000 Subject: [PATCH 01/35] Optimize vectorscan for aarch64 by using shrn instruction This optimization is based on the thread https://twitter.com/Danlark1/status/1539344279268691970 and uses the shift-right-and-narrow-by-4 (SHRN) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate-- To achieve that, I needed to slightly redesign movemask into comparemask and add an extra mask-iteration step: SHRN packs each byte's 0x00/0xff comparison result into a 4-bit group of a 64-bit mask, so on aarch64 mask_width() is 4 and iteration_mask() keeps only one bit per group before the mask is scanned. Our benchmarks showed a 10-15% improvement on average for long matches. --- src/hwlm/noodle_engine_simd.hpp | 55 ++++++++++------ src/nfa/limex_shuffle.hpp | 14 +++- src/util/arch/arm/match.hpp | 40 ++++++------ src/util/arch/arm/simd_utils.h | 27 ++++---- src/util/arch/ppc64el/match.hpp | 8 +-- src/util/arch/x86/match.hpp | 72 +++++++++++--------- src/util/supervector/arch/arm/impl.cpp | 32 ++++----- src/util/supervector/arch/ppc64el/impl.cpp | 18 +++-- src/util/supervector/arch/x86/impl.cpp | 76 +++++++++++++++------- src/util/supervector/supervector.hpp | 36 +++++++--- unit/internal/supervector.cpp | 36 ++++++---- 11 files changed, 264 insertions(+), 150 deletions(-) diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp index c49bfc7e..8006bd79 100644 --- a/src/hwlm/noodle_engine_simd.hpp +++ b/src/hwlm/noodle_engine_simd.hpp @@ -36,7 +36,7 @@ static really_really_inline hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos); @@ -49,7 +49,7 @@ static really_really_inline hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos - 1; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos); @@ -77,9 +77,11 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, SuperVector<S> v = SuperVector<S>::Zeroes(); memcpy(&v.u, d, l); - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l); + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()); v = v & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, buf, z, len, cbi); } @@ -103,9 +105,12 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l) << buf_off; + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, 
buf, z, len, cbi); } @@ -126,10 +131,13 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, memcpy(&v.u, d, l); v = v & caseMask; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l); - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << (SuperVector<S>::mask_width())) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -148,10 +156,14 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, } SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l) << buf_off; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << SuperVector<S>::mask_width()) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -191,7 +203,8 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); @@ -220,7 +233,7 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, size_t start = offset + n->msk_len - n->key_offset; - typename SuperVector<S>::movemask_type lastz1{0}; + typename SuperVector<S>::comparemask_type lastz1{0}; const u8 *d = buf + start; const u8 *e = buf + end; @@ -248,10 +261,12 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2; - lastz1 = z1 >> Z_SHIFT; + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + (z1 << SuperVector<S>::mask_width() | lastz1) & z2; + lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width()); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); diff --git a/src/nfa/limex_shuffle.hpp b/src/nfa/limex_shuffle.hpp index 4266d7da..367d400b 100644 --- a/src/nfa/limex_shuffle.hpp +++ b/src/nfa/limex_shuffle.hpp @@ -53,7 +53,15 @@ really_really_inline u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) { SuperVector<16> shuffled = s.pshufb(permute); 
SuperVector<16> compared = shuffled & compare; - u16 rv = ~compared.eqmask(shuffled); + u64a rv = (~compared.eqmask(shuffled)) & 0xffff; + if (SuperVector<16>::mask_width() != 1) { + u32 ans = 0; + for (u32 i = 0; i < 16; ++i) { + ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >> + (i * SuperVector<16>::mask_width() - i); + } + return ans; + } return (u32)rv; } @@ -62,7 +70,8 @@ really_really_inline u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) { SuperVector<32> shuffled = s.pshufb(permute); SuperVector<32> compared = shuffled & compare; - u32 rv = ~compared.eqmask(shuffled); + // TODO(danlark1): Future ARM support might have a bug. + u64a rv = (~compared.eqmask(shuffled)) & 0xffffffff; return (u32)((rv >> 16) | (rv & 0xffffU)); } @@ -71,6 +80,7 @@ really_really_inline u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) { SuperVector<64> shuffled = s.pshufb(permute); SuperVector<64> compared = shuffled & compare; + // TODO(danlark1): Future ARM support might have a bug. u64a rv = ~compared.eqmask(shuffled); rv = rv >> 32 | rv; return (u32)(((rv >> 16) | rv) & 0xffffU); diff --git a/src/util/arch/arm/match.hpp b/src/util/arch/arm/match.hpp index 892c3877..1280fed5 100644 --- a/src/util/arch/arm/match.hpp +++ b/src/util/arch/arm/match.hpp @@ -33,13 +33,13 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); - DEBUG_PRINTF("buf + pos %p\n", buf + pos); + DEBUG_PRINTF("buf + pos %p\n", buf + (pos)); return buf + pos; } else { return NULL; // no match @@ -52,13 +52,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } @@ -70,10 +69,10 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); 
DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); DEBUG_PRINTF("buf + pos %p\n", buf + pos); @@ -89,13 +88,12 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index e6836b25..68c29c67 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -86,8 +86,9 @@ static really_inline m128 not128(m128 a) { /** \brief Return 1 if a and b are different otherwise 0 */ static really_inline int diff128(m128 a, m128 b) { - int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b)); - return (-16 != res); + uint64_t res = vget_lane_u64( + (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0); + return (~0ull != res); } static really_inline int isnonzero128(m128 a) { @@ -379,15 +380,19 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - uint8x16_t input = vreinterpretq_u8_s32(a); - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); + static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + + // Compute the mask from the input + uint8x16_t mask = (uint8x16_t)vpaddlq_u32( + vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0); + return output; } static really_inline m128 set1_16x8(u8 c) { diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index a3f52e41..4f7cc7f1 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -30,7 +30,7 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -47,7 +47,7 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -63,7 +63,7 @@ const u8 
*last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { @@ -81,7 +81,7 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { diff --git a/src/util/arch/x86/match.hpp b/src/util/arch/x86/match.hpp index cbf4ab6b..d237567f 100644 --- a/src/util/arch/x86/match.hpp +++ b/src/util/arch/x86/match.hpp @@ -30,12 +30,13 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -47,8 +48,9 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); assert(pos < 32); @@ -61,7 +63,8 @@ const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -80,9 +83,10 @@ const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const le template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); DEBUG_PRINTF("match @ pos %u\n", pos); @@ -96,8 +100,9 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", 
z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); assert(pos < 32); @@ -110,7 +115,8 @@ const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -129,12 +135,13 @@ const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -146,10 +153,11 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); + u32 pos = ctz32(~z & 0xffffffffu); assert(pos < 32); DEBUG_PRINTF("match @ pos %u\n", pos); return buf + pos; @@ -160,7 +168,8 @@ const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -179,12 +188,13 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { - u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + u32 pos = clz32(~z & 0xffffu); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos >= 16 && pos < 32); return buf + (31 - pos); @@ -196,9 +206,10 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_ template<> really_really_inline const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED 
len) { - SuperVector<32>::movemask_type z = v.movemask(); - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z & 0xffffffff); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + if (unlikely(static_cast<u32>(z) != 0xffffffff)) { + u32 pos = clz32(~z & 0xffffffffu); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); assert(pos < 32); return buf + (31 - pos); @@ -210,8 +221,9 @@ const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_ template <> really_really_inline const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) { + assert(SuperVector<64>::mask_width() == 1); v.print8("v"); - SuperVector<64>::movemask_type z = v.movemask(); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index 89497d3d..b3e4233e 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -249,25 +249,25 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const -{ - SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL); - - // Compute the mask from the input - uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0])))); - uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7); - mask = vorrq_u8(mask, (uint8x16_t) mask1); - - // Get the resulting bytes - uint16_t output; - vst1q_lane_u16(&output, (uint16x8_t)mask, 0); - return static_cast<typename SuperVector<16>::movemask_type>(output); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return static_cast<typename SuperVector<16>::comparemask_type>( + vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0)); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 4; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask & 0x1111111111111111ull; } template <> diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 109b8d5e..5becb8f8 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -206,8 +206,8 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); @@ -230,11 +230,19 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename 
SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; +} template <> template diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 157f1dc4..c9daf0cf 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -203,15 +203,24 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ - return _mm_movemask_epi8(u.v128[0]); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return (u32)_mm_movemask_epi8(u.v128[0]); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; } // template <> @@ -754,17 +763,25 @@ really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) cons } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const -{ - return _mm256_movemask_epi8(u.v256[0]); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::comparemask(void) const { + return (u32)_mm256_movemask_epi8(u.v256[0]); } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::eqmask(SuperVector<32> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<32>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::iteration_mask( + typename SuperVector<32>::comparemask_type mask) { + return mask; +} // template <> // template @@ -1347,42 +1364,48 @@ really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b template <> really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + 
_mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } @@ -1393,19 +1416,28 @@ really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) cons } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::comparemask(void) const { __m512i msb = _mm512_set1_epi8(0xFF); __m512i mask = _mm512_and_si512(msb, u.v512[0]); return _mm512_cmpeq_epi8_mask(mask, msb); } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::eqmask(SuperVector<64> const b) const { return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); } +template <> really_inline u32 SuperVector<64>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::iteration_mask( + typename SuperVector<64>::comparemask_type mask) { + return mask; +} + // template <> // template // really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index f0ddf63c..51310db2 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -46,19 +46,29 @@ using Z_TYPE = u64a; #define Z_BITS 64 #define Z_SHIFT 63 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l))) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_256_BITS) using Z_TYPE = u32; #define Z_BITS 32 #define Z_SHIFT 31 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_128_BITS) +#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) +using Z_TYPE = u64a; +#define Z_BITS 64 +#define Z_POSSHIFT 2 +#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l))) +#else using Z_TYPE = u32; #define Z_BITS 32 +#define Z_POSSHIFT 0 +#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) +#endif #define Z_SHIFT 15 -#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #endif @@ -94,7 +104,8 @@ struct BaseVector static constexpr bool is_valid = false; static constexpr u16 size = 8; using type = void; - using movemask_type = void; + using comparemask_type = void; + using cmpmask_type = void; static constexpr bool has_previous = false; using previous_type = void; static constexpr u16 
previous_size = 4; @@ -106,7 +117,7 @@ struct BaseVector<128> static constexpr bool is_valid = true; static constexpr u16 size = 128; using type = void; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m512; static constexpr u16 previous_size = 64; @@ -118,7 +129,7 @@ struct BaseVector<64> static constexpr bool is_valid = true; static constexpr u16 size = 64; using type = m512; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m256; static constexpr u16 previous_size = 32; @@ -131,7 +142,7 @@ struct BaseVector<32> static constexpr bool is_valid = true; static constexpr u16 size = 32; using type = m256; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m128; static constexpr u16 previous_size = 16; @@ -144,7 +155,7 @@ struct BaseVector<16> static constexpr bool is_valid = true; static constexpr u16 size = 16; using type = m128; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = false; using previous_type = u64a; static constexpr u16 previous_size = 8; @@ -231,8 +242,17 @@ public: SuperVector eq(SuperVector const &b) const; SuperVector operator<<(uint8_t const N) const; SuperVector operator>>(uint8_t const N) const; - typename base_type::movemask_type movemask(void) const; - typename base_type::movemask_type eqmask(SuperVector const b) const; + // Returns mask_width groups of zeros or ones. To get the mask which can be + // iterated, use iteration_mask method, it ensures only one bit is set per + // mask_width group. + // Precondition: all bytes must be 0 or 0xff. + typename base_type::comparemask_type comparemask(void) const; + typename base_type::comparemask_type eqmask(SuperVector const b) const; + static u32 mask_width(); + // Returns a mask with at most 1 bit set to 1. It can be used to iterate + // over bits through ctz/clz and lowest bit clear. 
+ static typename base_type::comparemask_type + iteration_mask(typename base_type::comparemask_type mask); static SuperVector loadu(void const *ptr); static SuperVector load(void const *ptr); diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index deb3b169..0b4cae58 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -176,9 +176,9 @@ TEST(SuperVectorUtilsTest,Movemask128c){ } } auto SP = SuperVector<16>::loadu(vec); - u16 mask = SP.movemask(); - for(int i=0; i<16; i++) { - if (mask & (1 << i)) { + u64a mask = SP.comparemask(); + for (int i = 0; i < 16; i++) { + if (mask & (1ull << (i * SuperVector<16>::mask_width()))) { vec2[i] = 0xff; } } @@ -195,15 +195,21 @@ TEST(SuperVectorUtilsTest,Eqmask128c){ for (int i = 0; i<16; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<16>::loadu(vec); auto SP1 = SuperVector<16>::loadu(vec2); - int mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 16; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<16>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<16>::mask_width())); + for (u32 i = 2; i < 16; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } } /*Define LSHIFT128 macro*/ @@ -507,9 +513,9 @@ TEST(SuperVectorUtilsTest,Movemask256c){ } } auto SP = SuperVector<32>::loadu(vec); - u32 mask = SP.movemask(); + u64a mask = SP.comparemask(); for(int i=0; i<32; i++) { - if (mask & (1 << i)) { + if (mask & (1ull << (i * SuperVector<32>::mask_width()))) { vec2[i] = 0xff; } } @@ -527,15 +533,21 @@ TEST(SuperVectorUtilsTest,Eqmask256c){ for (int i = 0; i<32; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<32>::loadu(vec); auto SP1 = SuperVector<32>::loadu(vec2); - u32 mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFFFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 32; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<32>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<32>::mask_width())); + for (u32 i = 2; i < 32; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } } TEST(SuperVectorUtilsTest,pshufb256c) { @@ -871,6 +883,8 @@ TEST(SuperVectorUtilsTest,Eqmask512c){ auto SP = SuperVector<64>::loadu(vec); auto SP1 = SuperVector<64>::loadu(vec2); u64a mask = SP.eqmask(SP); + // Mask width for 64 bit type cannot be more than 1. 
+ ASSERT_EQ(SuperVector<64>::mask_width(), 1); ASSERT_EQ(mask,0xFFFFFFFFFFFFFFFF); mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); From 8a49e20bcd504f7bd8cc95d9e6807543296950d8 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 22:59:58 +0000 Subject: [PATCH 02/35] Fix formatting of a couple files --- src/util/arch/arm/simd_utils.h | 22 +++++++++------------- src/util/supervector/supervector.hpp | 11 +++++------ 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 68c29c67..8d8c4456 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -380,19 +380,15 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128, - 1, 2, 4, 8, 16, 32, 64, 128}; - - // Compute the mask from the input - uint8x16_t mask = (uint8x16_t)vpaddlq_u32( - vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); - uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7); - mask = vorrq_u8(mask, mask1); - - // Get the resulting bytes - uint16_t output; - vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0); - return output; + ruint8x16_t input = vreinterpretq_u8_s32(a); + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } static really_inline m128 set1_16x8(u8 c) { diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 51310db2..5d066c1a 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -104,8 +104,7 @@ struct BaseVector static constexpr bool is_valid = false; static constexpr u16 size = 8; using type = void; - using comparemask_type = void; - using cmpmask_type = void; + using comparemask_type = void; static constexpr bool has_previous = false; using previous_type = void; static constexpr u16 previous_size = 4; @@ -117,7 +116,7 @@ struct BaseVector<128> static constexpr bool is_valid = true; static constexpr u16 size = 128; using type = void; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m512; static constexpr u16 previous_size = 64; @@ -129,7 +128,7 @@ struct BaseVector<64> static constexpr bool is_valid = true; static constexpr u16 size = 64; using type = m512; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m256; static constexpr u16 previous_size = 32; @@ -142,7 +141,7 @@ struct BaseVector<32> static constexpr bool is_valid = true; static constexpr u16 size = 32; using type = m256; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m128; static constexpr u16 previous_size = 16; @@ -155,7 +154,7 @@ struct BaseVector<16> static constexpr bool is_valid = true; static constexpr u16 size = 16; using type = m128; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = false; using previous_type = u64a; static constexpr u16 previous_size = 8; From 
849846700a757efb454ada64ee5851f548f94807 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 23:02:02 +0000 Subject: [PATCH 03/35] Minor fix --- src/util/arch/arm/simd_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 8d8c4456..2a4f9c16 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -380,7 +380,7 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - ruint8x16_t input = vreinterpretq_u8_s32(a); + uint8x16_t input = vreinterpretq_u8_s32(a); uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); From 7e7f604f7d5bbb860e570a2e3e70eab0cbac1550 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 23:05:17 +0000 Subject: [PATCH 04/35] Fix ppc64el debug --- src/util/arch/ppc64el/match.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index 4f7cc7f1..bf71be2d 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -31,11 +31,11 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -48,8 +48,8 @@ template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); DEBUG_PRINTF("match @ pos %u\n", pos); @@ -64,11 +64,11 @@ template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -82,11 +82,11 @@ template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos >= 16 && pos < 32); return buf + (31 - pos); From db52ce6f086d7fa7e8cce29f06e31f19345c3ca0 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 20 Jul 2022 09:03:50 +0100 Subject: [PATCH 05/35] Fix avx512 movemask call 
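The Movemask512c unit test still called the old movemask() entry point, which no longer exists after the comparemask() rename. For context, a sketch of the contract the test exercises (assuming some u8 buf[64] whose bytes are all 0x00 or 0xff; names as in src/util/supervector): on AVX-512 mask_width() == 1, so bit i of the 64-bit mask corresponds to byte i of the vector.

    SuperVector<64> v = SuperVector<64>::loadu(buf);
    u64a z = v.comparemask();
    z = SuperVector<64>::iteration_mask(z); // identity when mask_width() == 1
    while (z) {
        u32 pos = ctz64(z) / SuperVector<64>::mask_width();
        z &= z - 1; // clear lowest set bit
        // ... byte pos of v was 0xff ...
    }

On aarch64 the same loop works unchanged because iteration_mask() leaves one bit per 4-bit group and the division by mask_width() rescales the bit index back to a byte index.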
--- unit/internal/supervector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index 0b4cae58..2432e598 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -861,7 +861,7 @@ TEST(SuperVectorUtilsTest,Movemask512c){ } auto SP = SuperVector<64>::loadu(vec); u8 vec2[64] = {0}; - u64a mask = SP.movemask(); + u64a mask = SP.comparemask(); for(int i=0; i<64; i++) { if (mask & (1ULL << i)) { vec2[i] = 0xff; From b5e1384995fc3cf214c8cfeccef9c5ca9e0b7f6a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 20 Jul 2022 13:26:52 +0000 Subject: [PATCH 06/35] Fixed the PCRE download location --- cmake/setenv-arm64-cross.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/setenv-arm64-cross.sh b/cmake/setenv-arm64-cross.sh index 4858da1e..c9001699 100644 --- a/cmake/setenv-arm64-cross.sh +++ b/cmake/setenv-arm64-cross.sh @@ -9,11 +9,11 @@ export CROSS_SYS= # wget -O boost_$BOOST_VERSION.tar.gz https://sourceforge.net/projects/boost/files/boost/$BOOST_DOT_VERSION/boost_$BOOST_VERSION.tar.gz/download # tar xf boost_$BOOST_VERSION.tar.gz # fi -if [ ! -d "pcre-8.41" ]; +if [ ! -d "pcre-8.45" ]; then - wget -O pcre-8.41.tar.bz2 https://ftp.pcre.org/pub/pcre/pcre-8.41.tar.bz2 - tar xf pcre-8.41.tar.bz2 + wget -O pcre-8.45.tar.bz2 https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.bz2/download + tar xf pcre-8.45.tar.bz2 export PCRE_SOURCE=1 fi -export BOOST_PATH= \ No newline at end of file +export BOOST_PATH= From cafd5248b11cbd98035286d64475b2c371aa4c87 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 4 Mar 2021 16:50:14 +0000 Subject: [PATCH 07/35] literal API: add instruction support fixes github issue #303 --- src/rose/program_runtime.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 7d4da45a..2bba5bbf 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -3092,6 +3092,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; const char *pc_base = getByOffset(t, programOffset); const char *pc = pc_base; @@ -3188,6 +3189,17 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(SOM_FROM_REPORT) { som = handleSomExternal(scratch, &ri->som, end); DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, @@ -3195,6 +3207,15 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(DEDUPE) { updateSeqPoint(tctxt, end, from_mpv); const char do_som = t->hasSom; // TODO: constant propagate From a119693a66504e671b73b6e96ef2bd9760647536 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 4 Mar 2021 17:00:34 +0000 Subject: [PATCH 
08/35] mcclellan: improve wide-state checking in Sherman optimization fixes github issue #305 --- src/nfa/mcclellancompile.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index b5c3a8ac..aa04e470 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1081,7 +1081,9 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit, // Use the daddy already set for this state so long as it isn't already // a Sherman state. dstate_id_t daddy = currState.daddy; - if (!info.is_sherman(daddy) && !info.is_widestate(daddy)) { + if (info.is_widestate(daddy)) { + return; + } else if (!info.is_sherman(daddy)) { hinted.insert(currState.daddy); } else { // Fall back to granddaddy, which has already been processed (due From decabdfede6a3d3d846964795b8a45fbe63025ff Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 11 Mar 2021 15:20:55 +0000 Subject: [PATCH 09/35] update year for bugfix #302-#305 --- src/compiler/compiler.cpp | 2 +- src/nfa/mcclellancompile.cpp | 2 +- src/rose/program_runtime.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 5751bd64..ae5927bc 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index aa04e470..055920b2 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 2bba5bbf..f607e8f2 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: From c1659b854437c4fa92cc2693b6c854cc2c4a4277 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Wed, 10 Mar 2021 07:20:01 +0000 Subject: [PATCH 10/35] Logical Combination: bypass combination flag in hs_expression_info. 
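With this change a logical combination expression can no longer be fed to hs_expression_info(); the call fails up front instead of the flag being silently accepted. Roughly (a sketch; the pattern IDs 101 and 102 are assumed to belong to other expressions in the set):

    hs_expr_info_t *info = NULL;
    hs_compile_error_t *error = NULL;
    hs_error_t err = hs_expression_info("101 & 102", HS_FLAG_COMBINATION,
                                        &info, &error);
    // err == HS_COMPILER_ERROR; error->message reports the unsupported
    // logical combination expression.
    hs_free_compile_error(error);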
Fixes github issue #291 --- src/hs.cpp | 8 +++++++- src/hs_compile.h | 12 +++--------- src/hs_internal.h | 6 ++++-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/hs.cpp b/src/hs.cpp index 303e7838..73cc032f 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -517,6 +517,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, return HS_COMPILER_ERROR; } + if (flags & HS_FLAG_COMBINATION) { + *error = generateCompileError("Invalid parameter: unsupported " + "logical combination expression", -1); + return HS_COMPILER_ERROR; + } + *info = nullptr; *error = nullptr; diff --git a/src/hs_compile.h b/src/hs_compile.h index b318c29d..5aa24188 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -748,10 +748,7 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param info * On success, a pointer to the pattern information will be returned in @@ -814,10 +811,7 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param ext * A pointer to a filled @ref hs_expr_ext_t structure that defines diff --git a/src/hs_internal.h b/src/hs_internal.h index adf07b22..4eb5e157 100644 --- a/src/hs_internal.h +++ b/src/hs_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Intel Corporation + * Copyright (c) 2019-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,7 +80,9 @@ extern "C" | HS_FLAG_PREFILTER \ | HS_FLAG_SINGLEMATCH \ | HS_FLAG_ALLOWEMPTY \ - | HS_FLAG_SOM_LEFTMOST) + | HS_FLAG_SOM_LEFTMOST \ + | HS_FLAG_COMBINATION \ + | HS_FLAG_QUIET) #ifdef __cplusplus } /* extern "C" */ From 2731a3384bbd7ffc4933f6d43478ef2762e5b4d8 Mon Sep 17 00:00:00 2001 From: hongyang7 Date: Thu, 16 Dec 2021 19:02:17 +0800 Subject: [PATCH 11/35] Fix segfaults on allocation failure (#4) Throw std::bad_alloc instead of returning nullptr from ue2::AlignedAllocator. Allocators for STL containers are expected never to return with an invalid pointer, and instead must throw on failure. Violating this expectation can lead to invalid pointer dereferences. 
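For illustration, a container built on the aligned allocator now sees an exception rather than dereferencing a null block (a sketch; huge_count stands in for an allocation that cannot be satisfied):

    std::vector<int, ue2::AlignedAllocator<int, 64>> v;
    try {
        v.resize(huge_count);
    } catch (const std::bad_alloc &) {
        // allocation failure is now reportable instead of a segfault
    }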
Co-authored-by: johanngan fixes github issue #317 (PR #320) --- src/util/alloc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/util/alloc.h b/src/util/alloc.h index de20c8d0..49b4a824 100644 --- a/src/util/alloc.h +++ b/src/util/alloc.h @@ -76,7 +76,11 @@ public: T *allocate(std::size_t size) const { size_t alloc_size = size * sizeof(T); - return static_cast<T *>(aligned_malloc_internal(alloc_size, N)); + T *ptr = static_cast<T *>(aligned_malloc_internal(alloc_size, N)); + if (!ptr) { + throw std::bad_alloc(); + } + return ptr; } void deallocate(T *x, std::size_t) const noexcept { From 4d4940dfbe523589e4ea90033bda4c574c73d627 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 28 Apr 2022 10:11:32 +0000 Subject: [PATCH 12/35] bugfix: fix overflow risk of strlen function --- src/compiler/compiler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index ae5927bc..32836834 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -323,7 +323,8 @@ void addExpression(NG &ng, unsigned index, const char *expression, } // Ensure that our pattern isn't too long (in characters). - if (strlen(expression) > cc.grey.limitPatternLength) { + size_t maxlen = cc.grey.limitPatternLength + 1; + if (strnlen(expression, maxlen) >= maxlen) { throw CompileError("Pattern length exceeds limit."); } From a9ca0e4de36ff32fb4a28f1bdc74ef08dc3f1ca4 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Thu, 12 May 2022 02:15:07 +0000 Subject: [PATCH 13/35] Corpus generator: fix random char value of UTF-8. fixes github issue #184 --- util/ng_corpus_generator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index 145a0ab8..6c3f613d 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -476,14 +476,14 @@ void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) { * that we've been asked for. */ unichar CorpusGeneratorUtf8::getRandomChar() { u32 range = MAX_UNICODE + 1 - - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1); + - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1); range = min(cProps.alphabetSize, range); assert(range); unichar c = 'a' + cProps.rand(0, range - 1); if (c >= UNICODE_SURROGATE_MIN) { - c =+ UNICODE_SURROGATE_MAX + 1; + c += UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1; } return c % (MAX_UNICODE + 1); From 31afacc7be282ac591e71564bfee794303a244fa Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Thu, 12 May 2022 08:20:29 +0000 Subject: [PATCH 14/35] Corpus editor: fix random char value of UTF-8. 
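Same arithmetic bug as the corpus generator: the number of excluded surrogate code points is UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1 (0xDFFF - 0xD800 + 1 = 0x800), not the sum of the two bounds. Using the same names as the code below, the intended mapping is roughly:

    // Draw from the MAX_UNICODE + 1 - 0x800 non-surrogate code points,
    // then skip over the surrogate block U+D800..U+DFFF.
    unichar raw = props.rand(0, (MAX_UNICODE + 1 - 0x800) - 1);
    unichar cp = raw < UNICODE_SURROGATE_MIN ? raw : raw + 0x800;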
--- util/ng_corpus_editor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/ng_corpus_editor.cpp b/util/ng_corpus_editor.cpp index ac4f8b65..c1149216 100644 --- a/util/ng_corpus_editor.cpp +++ b/util/ng_corpus_editor.cpp @@ -268,12 +268,12 @@ void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) { unichar CorpusEditorUtf8::chooseCodePoint(void) { /* We need to ensure that we don't pick a surrogate cp */ const u32 range = - MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1); + MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1); unichar raw = props.rand(0, range - 1); if (raw < UNICODE_SURROGATE_MIN) { return raw; } else { - return raw + UNICODE_SURROGATE_MAX + 1; + return raw + UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1; } } From 4f27a70dd7c4c48d259a77bf22bfd7dfa51b1d7e Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 28 Jul 2022 04:59:34 +0000 Subject: [PATCH 15/35] chimera: fix SKIP flag issue fix github issue #360 --- chimera/ch_runtime.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/chimera/ch_runtime.c b/chimera/ch_runtime.c index fdb5b992..1009036b 100644 --- a/chimera/ch_runtime.c +++ b/chimera/ch_runtime.c @@ -326,6 +326,10 @@ ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id, } else if (cbrv == CH_CALLBACK_SKIP_PATTERN) { DEBUG_PRINTF("user callback told us to skip this pattern\n"); pd->scanStart = hyctx->length; + if (top_id == id) { + break; + } + continue; } if (top_id == id) { From 70b2a28386f6a4be7903d9d61836c5918d219652 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 4 Mar 2021 16:13:46 +0000 Subject: [PATCH 16/35] literal API: add empty string check. fixes github issue #302, #304 --- src/compiler/compiler.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 32836834..35f46b3f 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -417,6 +417,10 @@ void addLitExpression(NG &ng, unsigned index, const char *expression, "HS_FLAG_SOM_LEFTMOST are supported in literal API."); } + if (!strcmp(expression, "")) { + throw CompileError("Pure literal API doesn't support empty string."); + } + // This expression must be a pure literal, we can build ue2_literal // directly based on expression text. 
ParsedLitExpression ple(index, expression, expLength, flags, id); From c597f69c5910db5042cf1942de64416ed41cd5f4 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Mon, 27 Jun 2022 16:07:16 +0800 Subject: [PATCH 17/35] fix build with glibc-2.34 SIGSTKSZ is no longer a constant after glibc 2.34 https://sourceware.org/pipermail/libc-alpha/2021-August/129718.html --- tools/hscollider/sig.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/hscollider/sig.cpp b/tools/hscollider/sig.cpp index bb00185d..d2e221b5 100644 --- a/tools/hscollider/sig.cpp +++ b/tools/hscollider/sig.cpp @@ -38,6 +38,7 @@ #if defined(HAVE_SIGACTION) #include <signal.h> +#define STACK_SIZE 8192 #endif #ifdef HAVE_BACKTRACE @@ -166,7 +167,7 @@ void installSignalHandler(void) { } #ifdef HAVE_SIGALTSTACK -static TLS_VARIABLE char alt_stack_loc[SIGSTKSZ]; +static TLS_VARIABLE char alt_stack_loc[STACK_SIZE]; #endif void setSignalStack(void) { @@ -178,7 +179,7 @@ void setSignalStack(void) { stack_t alt_stack; memset(&alt_stack, 0, sizeof(alt_stack)); alt_stack.ss_flags = 0; - alt_stack.ss_size = SIGSTKSZ; + alt_stack.ss_size = STACK_SIZE; alt_stack.ss_sp = alt_stack_loc; if (!sigaltstack(&alt_stack, nullptr)) { act.sa_flags |= SA_ONSTACK; From 74ab41897cc1d4f03555e5adde679fe21c60ee0a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 30 Aug 2022 20:40:23 +0300 Subject: [PATCH 18/35] Add missing header --- unit/internal/multi_bit_compress.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unit/internal/multi_bit_compress.cpp b/unit/internal/multi_bit_compress.cpp index 2d59ea14..40078f81 100644 --- a/unit/internal/multi_bit_compress.cpp +++ b/unit/internal/multi_bit_compress.cpp @@ -28,6 +28,8 @@ #include "config.h" +#include + #include "gtest/gtest.h" #include "ue2common.h" #include "util/compile_error.h" From 43c053a069848fbbd6f92f860dc035bd17bc3627 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 2 Sep 2022 15:12:56 +0300 Subject: [PATCH 19/35] add popcount32x4, popcount64x4 helper functions --- src/util/bitfield.h | 5 +---- src/util/popcount.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/util/bitfield.h b/src/util/bitfield.h index a580da7b..202232b6 100644 --- a/src/util/bitfield.h +++ b/src/util/bitfield.h @@ -189,10 +189,7 @@ public: size_t sum = 0; size_t i = 0; for (; i + 4 <= num_blocks; i += 4) { - sum += popcount64(bits[i]); - sum += popcount64(bits[i + 1]); - sum += popcount64(bits[i + 2]); - sum += popcount64(bits[i + 3]); + sum += popcount64x4(&bits[i]); } for (; i < num_blocks; i++) { sum += popcount64(bits[i]); diff --git a/src/util/popcount.h b/src/util/popcount.h index c7a69d46..d90a0d50 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -52,6 +52,15 @@ u32 popcount32(u32 x) { // #endif } +static really_inline +u32 popcount32x4(u32 const *x) { + u32 sum = popcount32(x[0]); + sum += popcount32(x[1]); + sum += popcount32(x[2]); + sum += popcount32(x[3]); + return sum; +} + static really_inline u32 popcount64(u64a x) { return __builtin_popcountll(x); @@ -73,5 +82,14 @@ u32 popcount64(u64a x) { // #endif } +static really_inline +u32 popcount64x4(u64a const *x) { + volatile u32 sum = popcount64(x[0]); + sum += popcount64(x[1]); + sum += popcount64(x[2]); + sum += popcount64(x[3]); + return sum; +} + #endif /* UTIL_POPCOUNT_H_ */ From 026f7616714896f314273c9732daefefb92590dd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 18:10:55 +0300 Subject: [PATCH 20/35] [VSX] optimized
mask1bit128(), moved simd_onebit_masks to common --- src/util/arch/common/simd_utils.h | 18 ++++++++++++ src/util/arch/ppc64el/simd_utils.h | 44 ++++-------------------------- 2 files changed, 23 insertions(+), 39 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 17de949a..2f2dcf7c 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -88,6 +88,24 @@ static inline void print_m128_2x64(const char *label, m128 vec) { #define print_m128_2x64(label, vec) ; #endif +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; + /**** **** 256-bit Primitives ****/ diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index d046ed47..ce67dae2 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -54,34 +54,6 @@ typedef __vector signed char int8x16_t; typedef unsigned long long int ulong64_t; typedef signed long long int long64_t; -/* -typedef __vector uint64_t uint64x2_t; -typedef __vector int64_t int64x2_t; -typedef __vector uint32_t uint32x4_t; -typedef __vector int32_t int32x4_t; -typedef __vector uint16_t uint16x8_t; -typedef __vector int16_t int16x8_t; -typedef __vector uint8_t uint8x16_t; -typedef __vector int8_t int8x16_t;*/ - - -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. 
*/ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; static really_inline m128 ones128(void) { return (m128) vec_splat_u8(-1); @@ -115,10 +87,6 @@ static really_inline u32 diffrich128(m128 a, m128 b) { m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b); mask = vec_and(not128(mask), movemask); m128 sum = vec_sums(mask, zeroes128()); - //sum = vec_sld(zeroes128(), sum, 4); - //s32 ALIGN_ATTR(16) x; - //vec_ste(sum, 0, &x); - //return x; // it could be ~(movemask_128(mask)) & 0x; return sum[3]; } @@ -131,10 +99,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask); m128 sum = vec_sums((m128)mask, zeroes128()); - //sum = vec_sld(zeroes128(), sum, 4); - //s32 ALIGN_ATTR(16) x; - //vec_ste(sum, 0, &x); - //return x; return sum[3]; } @@ -425,9 +389,11 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); + static uint64x2_t onebit = { 1, 0 }; + m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3)); + m128 bits = (m128) vec_splats((uint8_t) ((n % 8))); + m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets); + return (m128) vec_sll((uint8x16_t) mask, (uint8x16_t) bits); } // switches on bit N in the given vector. 
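The rewritten mask1bit128() computes 1 << n on the fly instead of loading a slice of the simd_onebit_masks LUT: vec_slo shifts the whole vector left by octets and vec_sll by the remaining 0-7 bits, which is why the shift counts are splatted as (n / 8) << 3 and n % 8. A scalar model of the same octet-then-bit split, a sketch under invented names rather than the project's code:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128; /* little-endian halves */

static u128 shl128(u128 v, unsigned s) { /* 0 <= s < 128 */
    u128 r = { 0, 0 };
    if (s == 0) {
        r = v;
    } else if (s < 64) {
        r.hi = (v.hi << s) | (v.lo >> (64 - s));
        r.lo = v.lo << s;
    } else {
        r.hi = v.lo << (s - 64);
    }
    return r;
}

/* Octet shift first (the vec_slo analogue), then the residual bit shift
 * (the vec_sll analogue); composing the two yields 1 << n. */
static u128 mask1bit_model(unsigned n) { /* n < 128 */
    u128 one = { 1, 0 };
    return shl128(shl128(one, (n / 8) * 8), n % 8);
}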
From 0e7874f122a55da0b2b92a129f5610e352594be6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 18:46:39 +0300 Subject: [PATCH 21/35] [VSX] optimize and correct lshift_m128/rshift_m128 --- src/util/arch/ppc64el/simd_utils.h | 44 ++++++------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index ce67dae2..589c4031 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -114,46 +114,18 @@ m128 sub_2x64(m128 a, m128 b) { static really_really_inline m128 lshift_m128(m128 a, unsigned b) { - switch(b){ - case 1: return vec_sld(a, zeroes128(), 1); break; - case 2: return vec_sld(a, zeroes128(), 2); break; - case 3: return vec_sld(a, zeroes128(), 3); break; - case 4: return vec_sld(a, zeroes128(), 4); break; - case 5: return vec_sld(a, zeroes128(), 5); break; - case 6: return vec_sld(a, zeroes128(), 6); break; - case 7: return vec_sld(a, zeroes128(), 7); break; - case 8: return vec_sld(a, zeroes128(), 8); break; - case 9: return vec_sld(a, zeroes128(), 9); break; - case 10: return vec_sld(a, zeroes128(), 10); break; - case 11: return vec_sld(a, zeroes128(), 11); break; - case 12: return vec_sld(a, zeroes128(), 12); break; - case 13: return vec_sld(a, zeroes128(), 13); break; - case 14: return vec_sld(a, zeroes128(), 14); break; - case 15: return vec_sld(a, zeroes128(), 15); break; - } - return a; + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl); + return result; } static really_really_inline m128 rshift_m128(m128 a, unsigned b) { - switch(b){ - case 1: return vec_sld(zeroes128(), a, 15); break; - case 2: return vec_sld(zeroes128(), a, 14); break; - case 3: return vec_sld(zeroes128(), a, 13); break; - case 4: return vec_sld(zeroes128(), a, 12); break; - case 5: return vec_sld(zeroes128(), a, 11); break; - case 6: return vec_sld(zeroes128(), a, 10); break; - case 7: return vec_sld(zeroes128(), a, 9); break; - case 8: return vec_sld(zeroes128(), a, 8); break; - case 9: return vec_sld(zeroes128(), a, 7); break; - case 10: return vec_sld(zeroes128(), a, 6); break; - case 11: return vec_sld(zeroes128(), a, 5); break; - case 12: return vec_sld(zeroes128(), a, 4); break; - case 13: return vec_sld(zeroes128(), a, 3); break; - case 14: return vec_sld(zeroes128(), a, 2); break; - case 15: return vec_sld(zeroes128(), a, 1); break; - } - return a; + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + m128 result = (m128) vec_sro((uint8x16_t) a, (uint8x16_t) sl); + return result; } static really_really_inline From 17467ff21bb7df033814968c75b2b91a429c62a8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 20:08:44 +0300 Subject: [PATCH 22/35] [VSX] huge optimization of movemask128 --- src/util/arch/ppc64el/simd_utils.h | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 589c4031..44c9122c 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -148,27 +148,13 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); } - static really_inline u32 movemask128(m128 a) { - uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7)); - - uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); - uint16x8_t res_and = 
vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); - uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - - uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14)); - uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); - uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - - uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); - uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); - - uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); - uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - - return s5[0]; + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + uint8x16_t bitmask = vec_gb((uint8x16_t) a); + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 movemask; + vec_ste((uint32x4_t) bitmask, 0, &movemask); + return movemask; } static really_inline m128 set1_16x8(u8 c) { From 94fe406f0c24a7996b12ee5a18378833c9fd813c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 23:39:44 +0300 Subject: [PATCH 23/35] [VSX] correct lshiftbyte_m128/rshiftbyte_m128, variable_byte_shift --- src/util/arch/ppc64el/simd_utils.h | 13 ++++---- unit/internal/simd_utils.cpp | 51 +++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 44c9122c..32014e54 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -285,7 +285,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } - #define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break; static really_really_inline @@ -326,21 +325,21 @@ m128 palignr(m128 r, m128 l, int offset) { static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - return rshift_m128(a,b); + return palignr_imm(zeroes128(), a, b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - return lshift_m128(a,b); + return palignr_imm(a, zeroes128(), 16 - b); } static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); - if (amount < 0){ - return palignr_imm(zeroes128(), in, -amount); - } else{ - return palignr_imm(in, zeroes128(), 16 - amount); + if (amount < 0) { + return rshiftbyte_m128(in, -amount); + } else { + return lshiftbyte_m128(in, amount); } } diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 69f1a64c..c5cfec7b 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -723,10 +723,59 @@ TEST(SimdUtilsTest, set2x128) { } #endif +#define TEST_LSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = lshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=0; i < l; i++) { \ + assert(res[i] == 0); \ + } \ + for (; i < 16; i++) { \ + assert(res[i] == vec[i - l]); \ + } \ + } + +TEST(SimdUtilsTest, lshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_LSHIFTBYTE128(v1, vec, j); + } +} + +#define TEST_RSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = rshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=15; i >= 16 - l; i--) { \ + assert(res[i] == 0); \ + } \ + for (; i >= 0; i--) { \ + assert(res[i] == vec[i + l]); \ + } 
\ + } + +TEST(SimdUtilsTest, rshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_RSHIFTBYTE128(v1, vec, j); + } +} + TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), @@ -773,7 +822,7 @@ TEST(SimdUtilsTest, variableByteShift128) { EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 15), variable_byte_shift_m128(in, 15))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); } From 7295b9c718c1716ad2ec161f7be15fddeafcd737 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 00:01:54 +0300 Subject: [PATCH 24/35] [VSX] add algorithm for alignr w/o use of immediates --- src/util/arch/ppc64el/simd_utils.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 32014e54..ea1766b2 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -313,12 +313,18 @@ m128 palignr_imm(m128 r, m128 l, int offset) { static really_really_inline m128 palignr(m128 r, m128 l, int offset) { -#if defined(HS_OPTIMIZE) - // need a faster way to do this. - return palignr_imm(r, l, offset); -#else - return palignr_imm(r, l, offset); + if (offset == 0) return l; + if (offset == 16) return r; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset); + } #endif + m128 sl = (m128) vec_splats((uint8_t) (offset << 3)); + m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3)); + m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr); + m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl); + return or128(lhs, rhs); } #undef CASE_ALIGN_VECTORS From dc6b8ae92db27e9d9bd19a427f0128cb7ef6fc9b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 02:02:11 +0300 Subject: [PATCH 25/35] optimize comparemask implementation, clean up code, use union types instead of casts --- src/util/supervector/arch/ppc64el/impl.cpp | 160 +++++++++++++-------- src/util/supervector/supervector.hpp | 10 +- 2 files changed, 108 insertions(+), 62 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 5becb8f8..7903bee2 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -39,7 +39,7 @@ #include "util/supervector/supervector.hpp" #include -// 128-bit Powerpc64le implementation +// 128-bit IBM Power VSX implementation template<> really_inline SuperVector<16>::SuperVector(SuperVector const &other) @@ -47,6 +47,69 @@ really_inline SuperVector<16>::SuperVector(SuperVector const &other) u.v128[0] = other.u.v128[0]; } +template<> +template<> +really_inline SuperVector<16>::SuperVector(char __bool __vector v) +{ + u.u8x16[0] = (uint8x16_t) v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8x16_t const v) +{ + u.s8x16[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8x16_t const v) +{ + u.u8x16[0] = v; +}; + +template<> +template<> +really_inline 
SuperVector<16>::SuperVector(int16x8_t const v) +{ + u.s16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16x8_t const v) +{ + u.u16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32x4_t const v) +{ + u.s32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32x4_t const v) +{ + u.u32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64x2_t const v) +{ + u.s64x2[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64x2_t const v) +{ + u.u64x2[0] = v; +}; + template<> really_inline SuperVector<16>::SuperVector(typename base_type::type const v) { @@ -57,69 +120,69 @@ template<> template<> really_inline SuperVector<16>::SuperVector(int8_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s8x16[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u8x16[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int16_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s16x8[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint16_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u16x8[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int32_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s32x4[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u32x4[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.s64x2[0] = (int64x2_t) vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(uint64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u64x2[0] = (uint64x2_t) vec_splats(static_cast(other)); } // Constants template<> really_inline SuperVector<16> SuperVector<16>::Ones(void) { - return {(m128) vec_splat_s8(-1)}; + return { vec_splat_s8(-1)}; } template<> really_inline SuperVector<16> SuperVector<16>::Zeroes(void) { - return {(m128) vec_splat_s8(0)}; + return { vec_splat_s8(0) }; } // Methods @@ -133,39 +196,38 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) template <> really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const { - return {vec_and(u.v128[0], b.u.v128[0])}; + return { vec_and(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const { - return {vec_or(u.v128[0], b.u.v128[0])}; + return { vec_or(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const { - return {(m128) vec_xor(u.v128[0], b.u.v128[0])}; + return { vec_xor(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator!() const { - return {(m128) vec_xor(u.v128[0], u.v128[0])}; + return { vec_xor(u.v128[0], u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { - m128 not_res = vec_xor(u.v128[0], 
(m128)vec_splat_s8(-1)); - return {(m128) vec_and(not_res, (m128)b.u.v128[0]) }; + int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1)); + return { vec_and(not_res, b.u.s8x16[0]) }; } - template <> really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const { - return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; + return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; } template <> @@ -177,28 +239,27 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const template <> really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const { - return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])}; + return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const { - return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])}; + return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const { - return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])}; + return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const { - return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])}; + return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])}; } - template <> really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const { @@ -208,25 +269,12 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons template <> really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::comparemask(void) const { - uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); - - uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); - uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); - uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - - uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14)); - uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); - uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - - uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); - uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); - - uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); - uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - - return s5[0]; + uint8x16_t bitmask = vec_gb( u.u8x16[0]); + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 movemask; + vec_ste((uint32x4_t) bitmask, 0, &movemask); + return movemask; } template <> @@ -248,35 +296,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const { - return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) }; + return { vec_sl(u.s8x16[0], vec_splat_u8(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const { - return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) }; + return { vec_sl(u.s16x8[0], vec_splat_u16(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const { - return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) }; + return { vec_sl(u.s32x4[0], vec_splat_u32(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { - 
return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) }; + return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const { - return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)}; + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; } template <> @@ -290,35 +338,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const { - return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) }; + return { vec_sr(u.s8x16[0], vec_splat_u8(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const { - return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) }; + return { vec_sr(u.s16x8[0], vec_splat_u16(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const { - return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) }; + return { vec_sr(u.s32x4[0], vec_splat_u32(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const { - return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; + return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const { - return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) }; + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; } template <> @@ -535,9 +583,7 @@ template <> really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { SuperVector<16> mask = Ones_vshr(16 -len); - mask.print8("mask"); SuperVector<16> v = loadu(ptr); - v.print8("v"); return mask & v; } @@ -574,9 +620,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. below is the version that is converted from Intel to PPC. 
*/ - uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80)); + uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80)); uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]); - return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask); + return { vec_sel(res, vec_splat_u8(0), mask) }; } template<> diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 5d066c1a..fef5f09f 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -177,13 +177,13 @@ public: #if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL) uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; - int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; + int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; - int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; + int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; uint16x8_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; - int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; + int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; - int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; + int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; #endif uint64_t u64[SIZE / sizeof(uint64_t)]; @@ -204,7 +204,7 @@ public: SuperVector(typename base_type::type const v); template - SuperVector(T other); + SuperVector(T const other); SuperVector(SuperVector const lo, SuperVector const hi); SuperVector(previous_type const lo, previous_type const hi); From be20c2c519b4afde108db21a90296410db933ed9 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 11:52:08 +0300 Subject: [PATCH 26/35] [VSX] optimize shifting methods, replace template Unroller --- src/util/supervector/arch/ppc64el/impl.cpp | 62 ++++++++-------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 7903bee2..94aa6a32 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -396,50 +396,40 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); - return result; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sl(u.u8x16[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); - return result; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sl(u.u16x8[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_32 
(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); - return result; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sl(u.u32x4[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); - return result; + uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sl(u.u64x2[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); - return result; + SuperVector sl{N << 3}; + return { vec_slo(u.u8x16[0], sl.u.u8x16[0]) }; } template <> @@ -452,50 +442,40 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); - return result; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sr(u.u8x16[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); - return result; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sr(u.u16x8[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); - return result; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sr(u.u32x4[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); - return result; + uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sr(u.u64x2[0], shift_indices) }; } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result 
= {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; }); - return result; + SuperVector sr{N << 3}; + return { vec_sro(u.u8x16[0], sr.u.u8x16[0]) }; } template <> From a837cf3bee355ab082e948d157c0eece66d46acc Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 12:16:14 +0300 Subject: [PATCH 27/35] [VSX] optimize shift operators --- src/util/supervector/arch/ppc64el/impl.cpp | 50 ++++++---------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 94aa6a32..90847a0c 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -487,51 +487,25 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { - switch(N) { - case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 15)}; break; - case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 14)}; break; - case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 13)}; break; - case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 12)}; break; - case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 11)}; break; - case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 10)}; break; - case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 9)}; break; - case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 8)}; break; - case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 7)}; break; - case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break; - case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break; - case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break; - case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break; - case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break; - case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break; - case 16: return Zeroes(); break; - default: break; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; } - return *this; +#endif + return vshr_128(N); } template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - switch(N) { - case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break; - case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break; - case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break; - case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break; - case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break; - case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break; - case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break; - case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break; - case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break; - case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break; - case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; 
break; - case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 12)}; break; - case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break; - case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break; - case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 15)}; break; - case 16: return Zeroes(); break; - default: break; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; } - return *this; +#endif + return vshl_128(N); } template<> From 305a041c737b882b17c609ca54faf39bf37788bd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 12:35:28 +0300 Subject: [PATCH 28/35] [VSX] optimize alignr method --- src/util/supervector/arch/ppc64el/impl.cpp | 35 ++++++++-------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 90847a0c..2eba69b2 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -523,14 +523,14 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { - return (m128) vec_xl(0, (const long64_t*)ptr); + return { vec_xl(0, (const long64_t*)ptr) }; } template <> really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) { assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); - return (m128) vec_xl(0, (const long64_t*)ptr); + return { vec_xl(0, (const long64_t*)ptr) }; } template <> @@ -544,27 +544,18 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { - - switch(offset) { - case 0: return other; break; - case 1: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 15)}; break; - case 2: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 14)}; break; - case 3: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 13)}; break; - case 4: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 12)}; break; - case 5: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 11)}; break; - case 6: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 10)}; break; - case 7: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 9)}; break; - case 8: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 8)}; break; - case 9: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 7)}; break; - case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break; - case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break; - case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break; - case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break; - case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break; - case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break; - default: break; + if (offset == 0) return other; + if (offset == 16) return *this; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return { vec_sld(u.s8x16[0], other.u.s8x16[0], offset) }; } - return *this; +#endif + uint8x16_t sl = vec_splats((uint8_t) (offset << 3)); + uint8x16_t sr = vec_splats((uint8_t) ((16 - offset) << 3)); + uint8x16_t rhs = vec_slo(u.u8x16[0], sr); + uint8x16_t lhs = vec_sro(other.u.u8x16[0], sl); + 
return { vec_or(lhs, rhs) }; } template<> From 02ae2a3cad3410129a98d4f530f3f3b316e24c29 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 12:41:32 +0300 Subject: [PATCH 29/35] remove simd_onebit_masks from arm/x86 headers, as they moved to common --- src/util/arch/arm/simd_utils.h | 18 ------------------ src/util/arch/x86/simd_utils.h | 18 ------------------ 2 files changed, 36 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 2a4f9c16..6447996c 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -53,24 +53,6 @@ #include // for memcpy -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. */ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; - static really_inline m128 ones128(void) { return (m128) vdupq_n_s8(0xFF); } diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index c4a3b97c..d432251f 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -42,24 +42,6 @@ #include // for memcpy -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. */ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; - static really_inline m128 ones128(void) { #if defined(__GNUC__) || defined(__INTEL_COMPILER) /* gcc gets this right */ From 0af2ba86165c469361fbfd9f34fd70aa2a53213d Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 10:20:01 +0000 Subject: [PATCH 30/35] [NEON] optimize mask1bit128, get rid of simd_onebit_masks --- src/util/arch/arm/simd_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 6447996c..45bcd23c 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -577,9 +577,9 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); + static m128 onebit = { 1, 0 }; + m128 mask = lshiftbyte_m128( onebit, n / 8 ); + return lshift64_m128( mask, n % 8 ); } // switches on bit N in the given vector. 
From 1ae0d151812fd7627ae921632af49309b14c22ae Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 13:42:25 +0300 Subject: [PATCH 31/35] readd simd_onebit_masks for x86, needs more work --- src/util/arch/common/simd_utils.h | 2 ++ src/util/arch/x86/simd_utils.h | 26 ++++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 2f2dcf7c..90ae80b0 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -88,6 +88,7 @@ static inline void print_m128_2x64(const char *label, m128 vec) { #define print_m128_2x64(label, vec) ; #endif +#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 @@ -105,6 +106,7 @@ ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { ZEROES_31, 0x80, ZEROES_32, ZEROES_32, ZEROES_32, }; +#endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64) /**** **** 256-bit Primitives diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index d432251f..f732e3b8 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -42,6 +42,24 @@ #include // for memcpy +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; + static really_inline m128 ones128(void) { #if defined(__GNUC__) || defined(__INTEL_COMPILER) /* gcc gets this right */ @@ -237,14 +255,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) { memcpy(&a, ptr, n); return a; } -/* -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} -#endif*/ static really_inline m128 mask1bit128(unsigned int n) { From 756ef409b400cabb66ae55d44971593fe85607d7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 15:07:20 +0300 Subject: [PATCH 32/35] provide non-immediate versions of lshiftbyte/rshiftbyte on x86 --- src/util/arch/x86/simd_utils.h | 65 ++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index f732e3b8..d3d07f79 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -165,8 +165,67 @@ m128 load_m128_from_u64a(const u64a *p) { return _mm_set_epi64x(0LL, *p); } -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) +#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; + +static really_inline +m128 rshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(count_immed)) { + return _mm_srli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_RSHIFT_VECTOR(a, 1); + CASE_RSHIFT_VECTOR(a, 2); + CASE_RSHIFT_VECTOR(a, 3); + 
CASE_RSHIFT_VECTOR(a, 4); + CASE_RSHIFT_VECTOR(a, 5); + CASE_RSHIFT_VECTOR(a, 6); + CASE_RSHIFT_VECTOR(a, 7); + CASE_RSHIFT_VECTOR(a, 8); + CASE_RSHIFT_VECTOR(a, 9); + CASE_RSHIFT_VECTOR(a, 10); + CASE_RSHIFT_VECTOR(a, 11); + CASE_RSHIFT_VECTOR(a, 12); + CASE_RSHIFT_VECTOR(a, 13); + CASE_RSHIFT_VECTOR(a, 14); + CASE_RSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_RSHIFT_VECTOR + +#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; + +static really_inline +m128 lshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(count_immed)) { + return _mm_slli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_LSHIFT_VECTOR(a, 1); + CASE_LSHIFT_VECTOR(a, 2); + CASE_LSHIFT_VECTOR(a, 3); + CASE_LSHIFT_VECTOR(a, 4); + CASE_LSHIFT_VECTOR(a, 5); + CASE_LSHIFT_VECTOR(a, 6); + CASE_LSHIFT_VECTOR(a, 7); + CASE_LSHIFT_VECTOR(a, 8); + CASE_LSHIFT_VECTOR(a, 9); + CASE_LSHIFT_VECTOR(a, 10); + CASE_LSHIFT_VECTOR(a, 11); + CASE_LSHIFT_VECTOR(a, 12); + CASE_LSHIFT_VECTOR(a, 13); + CASE_LSHIFT_VECTOR(a, 14); + CASE_LSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_LSHIFT_VECTOR #if defined(HAVE_SSE41) #define extract32from128(a, imm) _mm_extract_epi32(a, imm) @@ -322,6 +381,7 @@ m128 palignr_sw(m128 r, m128 l, int offset) { break; } } +#undef CASE_ALIGN_VECTORS static really_really_inline m128 palignr(m128 r, m128 l, int offset) { @@ -332,7 +392,6 @@ m128 palignr(m128 r, m128 l, int offset) { #endif return palignr_sw(r, l, offset); } -#undef CASE_ALIGN_VECTORS static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { From e3c237a7e055a0cf885712ca9ab9d907eb6bb18e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 16:00:10 +0300 Subject: [PATCH 33/35] use correct intrinsic for lshiftbyte_m128 --- src/util/arch/x86/simd_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index d3d07f79..924a91c6 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -196,7 +196,7 @@ m128 rshiftbyte_m128(const m128 a, int count_immed) { } #undef CASE_RSHIFT_VECTOR -#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; +#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break; static really_inline m128 lshiftbyte_m128(const m128 a, int count_immed) { From f4840adf3d6ff539241e2db3548b96a96585b138 Mon Sep 17 00:00:00 2001 From: liquidaty Date: Thu, 8 Sep 2022 09:59:37 -0700 Subject: [PATCH 34/35] fix to enable successful build with mingw64 --- src/util/alloc.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/util/alloc.cpp b/src/util/alloc.cpp index f3a2a259..40004932 100644 --- a/src/util/alloc.cpp +++ b/src/util/alloc.cpp @@ -47,7 +47,15 @@ namespace ue2 { #endif /* get us a posix_memalign from somewhere */ -#if !defined(HAVE_POSIX_MEMALIGN) +#if defined(__MINGW32__) || defined(__MINGW64__) + #include + #include + #include + #include + + #define posix_memalign(A, B, C) ((*A = (void *)__mingw_aligned_malloc(C, B)) == nullptr) + +#elif !defined(HAVE_POSIX_MEMALIGN) # if defined(HAVE_MEMALIGN) #define posix_memalign(A, B, C) ((*A = (void *)memalign(B, C)) == nullptr) # elif defined(HAVE__ALIGNED_MALLOC) @@ -77,7 +85,11 @@ void aligned_free_internal(void *ptr) 
{ return; } +#if defined(__MINGW32__) || defined(__MINGW64__) + __mingw_aligned_free(ptr); +#else free(ptr); +#endif } /** \brief 64-byte aligned, zeroed malloc. From 67b414f2f9e543e894ea3204e6ce71721a0c251b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 12 Sep 2022 13:09:51 +0000 Subject: [PATCH 35/35] [NEON] simplify/optimize shift/align primitives --- src/util/arch/arm/simd_utils.h | 220 +------------------------ src/util/supervector/arch/arm/impl.cpp | 96 ++++------- 2 files changed, 41 insertions(+), 275 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 45bcd23c..7f8539b0 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -112,43 +112,8 @@ m128 lshift_m128(m128 a, unsigned b) { return (m128) vshlq_n_u32((uint32x4_t)a, b); } #endif -#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break; - switch (b) { - case 0: return a; break; - CASE_LSHIFT_m128(a, 1); - CASE_LSHIFT_m128(a, 2); - CASE_LSHIFT_m128(a, 3); - CASE_LSHIFT_m128(a, 4); - CASE_LSHIFT_m128(a, 5); - CASE_LSHIFT_m128(a, 6); - CASE_LSHIFT_m128(a, 7); - CASE_LSHIFT_m128(a, 8); - CASE_LSHIFT_m128(a, 9); - CASE_LSHIFT_m128(a, 10); - CASE_LSHIFT_m128(a, 11); - CASE_LSHIFT_m128(a, 12); - CASE_LSHIFT_m128(a, 13); - CASE_LSHIFT_m128(a, 14); - CASE_LSHIFT_m128(a, 15); - CASE_LSHIFT_m128(a, 16); - CASE_LSHIFT_m128(a, 17); - CASE_LSHIFT_m128(a, 18); - CASE_LSHIFT_m128(a, 19); - CASE_LSHIFT_m128(a, 20); - CASE_LSHIFT_m128(a, 21); - CASE_LSHIFT_m128(a, 22); - CASE_LSHIFT_m128(a, 23); - CASE_LSHIFT_m128(a, 24); - CASE_LSHIFT_m128(a, 25); - CASE_LSHIFT_m128(a, 26); - CASE_LSHIFT_m128(a, 27); - CASE_LSHIFT_m128(a, 28); - CASE_LSHIFT_m128(a, 29); - CASE_LSHIFT_m128(a, 30); - CASE_LSHIFT_m128(a, 31); - default: return zeroes128(); break; - } -#undef CASE_LSHIFT_m128 + int32x4_t shift_indices = vdupq_n_s32(b); + return (m128) vshlq_s32(a, shift_indices); } static really_really_inline @@ -158,43 +123,8 @@ m128 rshift_m128(m128 a, unsigned b) { return (m128) vshrq_n_u32((uint32x4_t)a, b); } #endif -#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break; - switch (b) { - case 0: return a; break; - CASE_RSHIFT_m128(a, 1); - CASE_RSHIFT_m128(a, 2); - CASE_RSHIFT_m128(a, 3); - CASE_RSHIFT_m128(a, 4); - CASE_RSHIFT_m128(a, 5); - CASE_RSHIFT_m128(a, 6); - CASE_RSHIFT_m128(a, 7); - CASE_RSHIFT_m128(a, 8); - CASE_RSHIFT_m128(a, 9); - CASE_RSHIFT_m128(a, 10); - CASE_RSHIFT_m128(a, 11); - CASE_RSHIFT_m128(a, 12); - CASE_RSHIFT_m128(a, 13); - CASE_RSHIFT_m128(a, 14); - CASE_RSHIFT_m128(a, 15); - CASE_RSHIFT_m128(a, 16); - CASE_RSHIFT_m128(a, 17); - CASE_RSHIFT_m128(a, 18); - CASE_RSHIFT_m128(a, 19); - CASE_RSHIFT_m128(a, 20); - CASE_RSHIFT_m128(a, 21); - CASE_RSHIFT_m128(a, 22); - CASE_RSHIFT_m128(a, 23); - CASE_RSHIFT_m128(a, 24); - CASE_RSHIFT_m128(a, 25); - CASE_RSHIFT_m128(a, 26); - CASE_RSHIFT_m128(a, 27); - CASE_RSHIFT_m128(a, 28); - CASE_RSHIFT_m128(a, 29); - CASE_RSHIFT_m128(a, 30); - CASE_RSHIFT_m128(a, 31); - default: return zeroes128(); break; - } -#undef CASE_RSHIFT_m128 + int32x4_t shift_indices = vdupq_n_s32(-b); + return (m128) vshlq_s32(a, shift_indices); } static really_really_inline @@ -204,75 +134,8 @@ m128 lshift64_m128(m128 a, unsigned b) { return (m128) vshlq_n_u64((uint64x2_t)a, b); } #endif -#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break; - switch (b) { - case 0: return a; 
break; - CASE_LSHIFT64_m128(a, 1); - CASE_LSHIFT64_m128(a, 2); - CASE_LSHIFT64_m128(a, 3); - CASE_LSHIFT64_m128(a, 4); - CASE_LSHIFT64_m128(a, 5); - CASE_LSHIFT64_m128(a, 6); - CASE_LSHIFT64_m128(a, 7); - CASE_LSHIFT64_m128(a, 8); - CASE_LSHIFT64_m128(a, 9); - CASE_LSHIFT64_m128(a, 10); - CASE_LSHIFT64_m128(a, 11); - CASE_LSHIFT64_m128(a, 12); - CASE_LSHIFT64_m128(a, 13); - CASE_LSHIFT64_m128(a, 14); - CASE_LSHIFT64_m128(a, 15); - CASE_LSHIFT64_m128(a, 16); - CASE_LSHIFT64_m128(a, 17); - CASE_LSHIFT64_m128(a, 18); - CASE_LSHIFT64_m128(a, 19); - CASE_LSHIFT64_m128(a, 20); - CASE_LSHIFT64_m128(a, 21); - CASE_LSHIFT64_m128(a, 22); - CASE_LSHIFT64_m128(a, 23); - CASE_LSHIFT64_m128(a, 24); - CASE_LSHIFT64_m128(a, 25); - CASE_LSHIFT64_m128(a, 26); - CASE_LSHIFT64_m128(a, 27); - CASE_LSHIFT64_m128(a, 28); - CASE_LSHIFT64_m128(a, 29); - CASE_LSHIFT64_m128(a, 30); - CASE_LSHIFT64_m128(a, 31); - CASE_LSHIFT64_m128(a, 32); - CASE_LSHIFT64_m128(a, 33); - CASE_LSHIFT64_m128(a, 34); - CASE_LSHIFT64_m128(a, 35); - CASE_LSHIFT64_m128(a, 36); - CASE_LSHIFT64_m128(a, 37); - CASE_LSHIFT64_m128(a, 38); - CASE_LSHIFT64_m128(a, 39); - CASE_LSHIFT64_m128(a, 40); - CASE_LSHIFT64_m128(a, 41); - CASE_LSHIFT64_m128(a, 42); - CASE_LSHIFT64_m128(a, 43); - CASE_LSHIFT64_m128(a, 44); - CASE_LSHIFT64_m128(a, 45); - CASE_LSHIFT64_m128(a, 46); - CASE_LSHIFT64_m128(a, 47); - CASE_LSHIFT64_m128(a, 48); - CASE_LSHIFT64_m128(a, 49); - CASE_LSHIFT64_m128(a, 50); - CASE_LSHIFT64_m128(a, 51); - CASE_LSHIFT64_m128(a, 52); - CASE_LSHIFT64_m128(a, 53); - CASE_LSHIFT64_m128(a, 54); - CASE_LSHIFT64_m128(a, 55); - CASE_LSHIFT64_m128(a, 56); - CASE_LSHIFT64_m128(a, 57); - CASE_LSHIFT64_m128(a, 58); - CASE_LSHIFT64_m128(a, 59); - CASE_LSHIFT64_m128(a, 60); - CASE_LSHIFT64_m128(a, 61); - CASE_LSHIFT64_m128(a, 62); - CASE_LSHIFT64_m128(a, 63); - default: return zeroes128(); break; - } -#undef CASE_LSHIFT64_m128 + int64x2_t shift_indices = vdupq_n_s64(b); + return (m128) vshlq_s64((int64x2_t) a, shift_indices); } static really_really_inline @@ -282,75 +145,8 @@ m128 rshift64_m128(m128 a, unsigned b) { return (m128) vshrq_n_u64((uint64x2_t)a, b); } #endif -#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break; - switch (b) { - case 0: return a; break; - CASE_RSHIFT64_m128(a, 1); - CASE_RSHIFT64_m128(a, 2); - CASE_RSHIFT64_m128(a, 3); - CASE_RSHIFT64_m128(a, 4); - CASE_RSHIFT64_m128(a, 5); - CASE_RSHIFT64_m128(a, 6); - CASE_RSHIFT64_m128(a, 7); - CASE_RSHIFT64_m128(a, 8); - CASE_RSHIFT64_m128(a, 9); - CASE_RSHIFT64_m128(a, 10); - CASE_RSHIFT64_m128(a, 11); - CASE_RSHIFT64_m128(a, 12); - CASE_RSHIFT64_m128(a, 13); - CASE_RSHIFT64_m128(a, 14); - CASE_RSHIFT64_m128(a, 15); - CASE_RSHIFT64_m128(a, 16); - CASE_RSHIFT64_m128(a, 17); - CASE_RSHIFT64_m128(a, 18); - CASE_RSHIFT64_m128(a, 19); - CASE_RSHIFT64_m128(a, 20); - CASE_RSHIFT64_m128(a, 21); - CASE_RSHIFT64_m128(a, 22); - CASE_RSHIFT64_m128(a, 23); - CASE_RSHIFT64_m128(a, 24); - CASE_RSHIFT64_m128(a, 25); - CASE_RSHIFT64_m128(a, 26); - CASE_RSHIFT64_m128(a, 27); - CASE_RSHIFT64_m128(a, 28); - CASE_RSHIFT64_m128(a, 29); - CASE_RSHIFT64_m128(a, 30); - CASE_RSHIFT64_m128(a, 31); - CASE_RSHIFT64_m128(a, 32); - CASE_RSHIFT64_m128(a, 33); - CASE_RSHIFT64_m128(a, 34); - CASE_RSHIFT64_m128(a, 35); - CASE_RSHIFT64_m128(a, 36); - CASE_RSHIFT64_m128(a, 37); - CASE_RSHIFT64_m128(a, 38); - CASE_RSHIFT64_m128(a, 39); - CASE_RSHIFT64_m128(a, 40); - CASE_RSHIFT64_m128(a, 41); - CASE_RSHIFT64_m128(a, 42); - CASE_RSHIFT64_m128(a, 43); - 
-    CASE_RSHIFT64_m128(a, 44);
-    CASE_RSHIFT64_m128(a, 45);
-    CASE_RSHIFT64_m128(a, 46);
-    CASE_RSHIFT64_m128(a, 47);
-    CASE_RSHIFT64_m128(a, 48);
-    CASE_RSHIFT64_m128(a, 49);
-    CASE_RSHIFT64_m128(a, 50);
-    CASE_RSHIFT64_m128(a, 51);
-    CASE_RSHIFT64_m128(a, 52);
-    CASE_RSHIFT64_m128(a, 53);
-    CASE_RSHIFT64_m128(a, 54);
-    CASE_RSHIFT64_m128(a, 55);
-    CASE_RSHIFT64_m128(a, 56);
-    CASE_RSHIFT64_m128(a, 57);
-    CASE_RSHIFT64_m128(a, 58);
-    CASE_RSHIFT64_m128(a, 59);
-    CASE_RSHIFT64_m128(a, 60);
-    CASE_RSHIFT64_m128(a, 61);
-    CASE_RSHIFT64_m128(a, 62);
-    CASE_RSHIFT64_m128(a, 63);
-    default: return zeroes128(); break;
-    }
-#undef CASE_RSHIFT64_m128
+    int64x2_t shift_indices = vdupq_n_s64(-b);
+    return (m128) vshlq_u64((uint64x2_t) a, shift_indices);
 }

 static really_inline m128 eq128(m128 a, m128 b) {
diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp
index b3e4233e..5283ab00 100644
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -374,10 +374,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
-    return result;
+    if (N == 8) return Zeroes();
+    int8x16_t shift_indices = vdupq_n_s8(N);
+    return { vshlq_s8(u.s8x16[0], shift_indices) };
 }

 template <>
@@ -385,9 +384,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
-    return result;
+    int16x8_t shift_indices = vdupq_n_s16(N);
+    return { vshlq_s16(u.s16x8[0], shift_indices) };
 }

 template <>
@@ -395,9 +393,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
-    return result;
+    int32x4_t shift_indices = vdupq_n_s32(N);
+    return { vshlq_s32(u.s32x4[0], shift_indices) };
 }

 template <>
@@ -405,9 +402,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 64) return Zeroes();
-    SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
-    return result;
+    int64x2_t shift_indices = vdupq_n_s64(N);
+    return { vshlq_s64(u.s64x2[0], shift_indices) };
 }

 template <>
@@ -415,6 +411,11 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(N)) {
+        return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
+    }
+#endif
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
     return result;
@@ -431,9 +432,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 8) return Zeroes();
-    SuperVector result;
-    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
-    return result;
+    int8x16_t shift_indices = vdupq_n_s8(-N);
+    return { vshlq_u8(u.u8x16[0], shift_indices) };
 }

 template <>
@@ -441,9 +441,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
-    return result;
+    int16x8_t shift_indices = vdupq_n_s16(-N);
+    return { vshlq_u16(u.u16x8[0], shift_indices) };
 }

 template <>
@@ -451,9 +450,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
-    return result;
+    int32x4_t shift_indices = vdupq_n_s32(-N);
+    return { vshlq_u32(u.u32x4[0], shift_indices) };
 }

 template <>
@@ -461,9 +459,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 64) return Zeroes();
-    SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
-    return result;
+    int64x2_t shift_indices = vdupq_n_s64(-N);
+    return { vshlq_u64(u.u64x2[0], shift_indices) };
 }

 template <>
@@ -471,6 +468,11 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(N)) {
+        return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
+    }
+#endif
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
     return result;
@@ -485,22 +487,12 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(N)) {
-        return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
-    }
-#endif
     return vshr_128(N);
 }

 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(N)) {
-        return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
-    }
-#endif
     return vshl_128(N);
 }

@@ -534,45 +526,23 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
     SuperVector mask = Ones_vshr(16 -len);
-    //mask.print8("mask");
     SuperVector<16> v = loadu(ptr);
-    //v.print8("v");
     return mask & v;
 }

 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {
+    if (offset == 0) return other;
+    if (offset == 16) return *this;
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(offset)) {
-        if (offset == 16) {
-            return *this;
-        } else {
-            return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)};
-        }
+        return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)};
     }
 #endif
-    switch(offset) {
-    case 0: return other; break;
-    case 1: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 1)}; break;
-    case 2: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 2)}; break;
-    case 3: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 3)}; break;
-    case 4: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 4)}; break;
-    case 5: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 5)}; break;
-    case 6: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 6)}; break;
-    case 7: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 7)}; break;
-    case 8: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 8)}; break;
-    case 9: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 9)}; break;
-    case 10: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 10)}; break;
-    case 11: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 11)}; break;
-    case 12: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 12)}; break;
-    case 13: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 13)}; break;
-    case 14: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 14)}; break;
-    case 15: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 15)}; break;
-    case 16: return *this; break;
-    default: break;
-    }
-    return *this;
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (offset == n) result = {vextq_u8(other.u.u8x16[0], v->u.u8x16[0], n)}; });
+    return result;
 }

 template<>
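A note on the trick this patch applies in every hunk: AArch64 NEON has no vector shift instruction that takes a runtime right-shift count, so each rewritten primitive broadcasts the negated count into a vector and uses VSHL (vshlq_*), which shifts left for positive per-lane counts and right for negative ones. The unsigned forms (vshlq_u*) perform the logical right shift that the replaced vshrq_n_u* immediates provided, whereas the signed forms (vshlq_s*) would shift arithmetically and smear the sign bit. A minimal standalone sketch of the technique follows, assuming an AArch64 target; the helper name var_rshift32 is illustrative and not part of the patch:

    #include <arm_neon.h>

    /* Right-shift four u32 lanes by a runtime count b.
     * USHL (vshlq_u32) shifts left for positive per-lane counts and
     * logically right for negative ones, so broadcast -b to all lanes. */
    static inline uint32x4_t var_rshift32(uint32x4_t a, unsigned b) {
        if (b == 0) return a;                /* nothing to shift */
        if (b >= 32) return vdupq_n_u32(0);  /* all bits shifted out */
        int32x4_t shift_indices = vdupq_n_s32(-(int)b);
        return vshlq_u32(a, shift_indices);
    }

The explicit guards mirror the N == 0 and lane-width special cases kept in vshl_8 and friends; a single vdupq_n plus vshlq pair then replaces a 30-to-60-way switch over the immediate-only vshlq_n_*/vshrq_n_* forms.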