From 49eb18ee4f21b5bd389e0e9d5644be1ec1dc85c6 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 22:50:05 +0000 Subject: [PATCH] Optimize vectorscan for aarch64 by using shrn instruction This optimization is based on the thread https://twitter.com/Danlark1/status/1539344279268691970 and uses the "shift right and narrow by 4" instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate-- To achieve that, I needed to slightly redesign movemask into comparemask and add an extra mask-iteration step. Our benchmarks showed a 10-15% improvement on average for long matches. Illustrative sketches of the new mask layout are appended after the diff. --- src/hwlm/noodle_engine_simd.hpp | 55 ++++++++++------ src/nfa/limex_shuffle.hpp | 14 +++- src/util/arch/arm/match.hpp | 40 ++++++------ src/util/arch/arm/simd_utils.h | 27 ++++---- src/util/arch/ppc64el/match.hpp | 8 +-- src/util/arch/x86/match.hpp | 72 +++++++++++--------- src/util/supervector/arch/arm/impl.cpp | 32 ++++----- src/util/supervector/arch/ppc64el/impl.cpp | 18 +++-- src/util/supervector/arch/x86/impl.cpp | 76 +++++++++++++++------- src/util/supervector/supervector.hpp | 36 +++++++--- unit/internal/supervector.cpp | 36 ++++++---- 11 files changed, 264 insertions(+), 150 deletions(-) diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp index c49bfc7e..8006bd79 100644 --- a/src/hwlm/noodle_engine_simd.hpp +++ b/src/hwlm/noodle_engine_simd.hpp @@ -36,7 +36,7 @@ static really_really_inline hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos); @@ -49,7 +49,7 @@ static really_really_inline hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos - 1; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos); @@ -77,9 +77,11 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, SuperVector<S> v = SuperVector<S>::Zeroes(); memcpy(&v.u, d, l); - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l); + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()); v = v & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, buf, z, len, cbi); } @@ -103,9 +105,12 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l) << buf_off; + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, buf, z, 
len, cbi); } @@ -126,10 +131,13 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, memcpy(&v.u, d, l); v = v & caseMask; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l); - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << (SuperVector<S>::mask_width())) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -148,10 +156,14 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, } SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l) << buf_off; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << SuperVector<S>::mask_width()) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -191,7 +203,8 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); @@ -220,7 +233,7 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, size_t start = offset + n->msk_len - n->key_offset; - typename SuperVector<S>::movemask_type lastz1{0}; + typename SuperVector<S>::comparemask_type lastz1{0}; const u8 *d = buf + start; const u8 *e = buf + end; @@ -248,10 +261,12 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2; - lastz1 = z1 >> Z_SHIFT; + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + (z1 << SuperVector<S>::mask_width() | lastz1) & z2; + lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width()); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); diff --git a/src/nfa/limex_shuffle.hpp b/src/nfa/limex_shuffle.hpp index 4266d7da..367d400b 100644 --- a/src/nfa/limex_shuffle.hpp +++ b/src/nfa/limex_shuffle.hpp @@ -53,7 +53,15 @@ really_really_inline u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) { SuperVector<16> shuffled = s.pshufb(permute); SuperVector<16> 
compared = shuffled & compare; - u16 rv = ~compared.eqmask(shuffled); + u64a rv = (~compared.eqmask(shuffled)) & 0xffff; + if (SuperVector<16>::mask_width() != 1) { + u32 ans = 0; + for (u32 i = 0; i < 16; ++i) { + ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >> + (i * SuperVector<16>::mask_width() - i); + } + return ans; + } return (u32)rv; } @@ -62,7 +70,8 @@ really_really_inline u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) { SuperVector<32> shuffled = s.pshufb(permute); SuperVector<32> compared = shuffled & compare; - u32 rv = ~compared.eqmask(shuffled); + // TODO(danlark1): Future ARM support might have a bug. + u64a rv = (~compared.eqmask(shuffled)) & 0xffffffff; return (u32)((rv >> 16) | (rv & 0xffffU)); } @@ -71,6 +80,7 @@ really_really_inline u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) { SuperVector<64> shuffled = s.pshufb(permute); SuperVector<64> compared = shuffled & compare; + // TODO(danlark1): Future ARM support might have a bug. u64a rv = ~compared.eqmask(shuffled); rv = rv >> 32 | rv; return (u32)(((rv >> 16) | rv) & 0xffffU); diff --git a/src/util/arch/arm/match.hpp b/src/util/arch/arm/match.hpp index 892c3877..1280fed5 100644 --- a/src/util/arch/arm/match.hpp +++ b/src/util/arch/arm/match.hpp @@ -33,13 +33,13 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); - DEBUG_PRINTF("buf + pos %p\n", buf + pos); + DEBUG_PRINTF("buf + pos %p\n", buf + (pos)); return buf + pos; } else { return NULL; // no match @@ -52,13 +52,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } @@ -70,10 +69,10 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", 
pos); assert(pos < 16); DEBUG_PRINTF("buf + pos %p\n", buf + pos); @@ -89,13 +88,12 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index e6836b25..68c29c67 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -86,8 +86,9 @@ static really_inline m128 not128(m128 a) { /** \brief Return 1 if a and b are different otherwise 0 */ static really_inline int diff128(m128 a, m128 b) { - int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b)); - return (-16 != res); + uint64_t res = vget_lane_u64( + (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0); + return (~0ull != res); } static really_inline int isnonzero128(m128 a) { @@ -379,15 +380,19 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - uint8x16_t input = vreinterpretq_u8_s32(a); - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); + static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + + // Compute the mask from the input + uint8x16_t mask = (uint8x16_t)vpaddlq_u32( + vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0); + return output; } static really_inline m128 set1_16x8(u8 c) { diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index a3f52e41..4f7cc7f1 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -30,7 +30,7 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -47,7 +47,7 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -63,7 +63,7 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, 
u16 const UN template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { @@ -81,7 +81,7 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { diff --git a/src/util/arch/x86/match.hpp b/src/util/arch/x86/match.hpp index cbf4ab6b..d237567f 100644 --- a/src/util/arch/x86/match.hpp +++ b/src/util/arch/x86/match.hpp @@ -30,12 +30,13 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -47,8 +48,9 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); assert(pos < 32); @@ -61,7 +63,8 @@ const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -80,9 +83,10 @@ const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const le template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); DEBUG_PRINTF("match @ pos %u\n", pos); @@ -96,8 +100,9 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + 
SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); assert(pos < 32); @@ -110,7 +115,8 @@ const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -129,12 +135,13 @@ const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -146,10 +153,11 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); + u32 pos = ctz32(~z & 0xffffffffu); assert(pos < 32); DEBUG_PRINTF("match @ pos %u\n", pos); return buf + pos; @@ -160,7 +168,8 @@ const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -179,12 +188,13 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { - u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + u32 pos = clz32(~z & 0xffffu); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos >= 16 && pos < 32); return buf + (31 - pos); @@ -196,9 +206,10 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_ template<> really_really_inline const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) { - SuperVector<32>::movemask_type z = 
v.movemask(); - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z & 0xffffffff); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + if (unlikely(static_cast<u32>(z) != 0xffffffff)) { + u32 pos = clz32(~z & 0xffffffffu); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); assert(pos < 32); return buf + (31 - pos); @@ -210,8 +221,9 @@ const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_ template <> really_really_inline const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) { + assert(SuperVector<64>::mask_width() == 1); v.print8("v"); - SuperVector<64>::movemask_type z = v.movemask(); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index 89497d3d..b3e4233e 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -249,25 +249,25 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const -{ - SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL); - - // Compute the mask from the input - uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0])))); - uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7); - mask = vorrq_u8(mask, (uint8x16_t) mask1); - - // Get the resulting bytes - uint16_t output; - vst1q_lane_u16(&output, (uint16x8_t)mask, 0); - return static_cast<typename SuperVector<16>::movemask_type>(output); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return static_cast<typename SuperVector<16>::comparemask_type>( + vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0)); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 4; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask & 0x1111111111111111ull; } template <> diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 109b8d5e..5becb8f8 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -206,8 +206,8 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); @@ -230,11 +230,19 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type 
+SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; +} template <> template diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 157f1dc4..c9daf0cf 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -203,15 +203,24 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ - return _mm_movemask_epi8(u.v128[0]); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return (u32)_mm_movemask_epi8(u.v128[0]); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; } // template <> @@ -754,17 +763,25 @@ really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) cons } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const -{ - return _mm256_movemask_epi8(u.v256[0]); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::comparemask(void) const { + return (u32)_mm256_movemask_epi8(u.v256[0]); } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::eqmask(SuperVector<32> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<32>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::iteration_mask( + typename SuperVector<32>::comparemask_type mask) { + return mask; +} // template <> // template @@ -1347,42 +1364,48 @@ really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b template <> really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpgt_epi8_mask(u.v512[0], 
b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } @@ -1393,19 +1416,28 @@ really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) cons } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::comparemask(void) const { __m512i msb = _mm512_set1_epi8(0xFF); __m512i mask = _mm512_and_si512(msb, u.v512[0]); return _mm512_cmpeq_epi8_mask(mask, msb); } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::eqmask(SuperVector<64> const b) const { return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); } +template <> really_inline u32 SuperVector<64>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::iteration_mask( + typename SuperVector<64>::comparemask_type mask) { + return mask; +} + // template <> // template // really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index f0ddf63c..51310db2 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -46,19 +46,29 @@ using Z_TYPE = u64a; #define Z_BITS 64 #define Z_SHIFT 63 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l))) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_256_BITS) using Z_TYPE = u32; #define Z_BITS 32 #define Z_SHIFT 31 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_128_BITS) +#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) +using Z_TYPE = u64a; +#define Z_BITS 64 +#define Z_POSSHIFT 2 +#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l))) +#else using Z_TYPE = u32; #define Z_BITS 32 +#define Z_POSSHIFT 0 +#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) +#endif #define Z_SHIFT 15 -#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #endif @@ -94,7 +104,8 @@ struct BaseVector static constexpr bool is_valid = false; static constexpr u16 size = 8; using type = void; - using movemask_type = void; + using comparemask_type = void; + using cmpmask_type = void; static constexpr bool has_previous = false; using previous_type = void; static constexpr u16 previous_size = 4; @@ -106,7 +117,7 @@ 
struct BaseVector<128> static constexpr bool is_valid = true; static constexpr u16 size = 128; using type = void; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m512; static constexpr u16 previous_size = 64; @@ -118,7 +129,7 @@ struct BaseVector<64> static constexpr bool is_valid = true; static constexpr u16 size = 64; using type = m512; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m256; static constexpr u16 previous_size = 32; @@ -131,7 +142,7 @@ struct BaseVector<32> static constexpr bool is_valid = true; static constexpr u16 size = 32; using type = m256; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m128; static constexpr u16 previous_size = 16; @@ -144,7 +155,7 @@ struct BaseVector<16> static constexpr bool is_valid = true; static constexpr u16 size = 16; using type = m128; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = false; using previous_type = u64a; static constexpr u16 previous_size = 8; @@ -231,8 +242,17 @@ public: SuperVector eq(SuperVector const &b) const; SuperVector operator<<(uint8_t const N) const; SuperVector operator>>(uint8_t const N) const; - typename base_type::movemask_type movemask(void) const; - typename base_type::movemask_type eqmask(SuperVector const b) const; + // Returns mask_width groups of zeros or ones. To get the mask which can be + // iterated, use iteration_mask method, it ensures only one bit is set per + // mask_width group. + // Precondition: all bytes must be 0 or 0xff. + typename base_type::comparemask_type comparemask(void) const; + typename base_type::comparemask_type eqmask(SuperVector const b) const; + static u32 mask_width(); + // Returns a mask with at most 1 bit set to 1. It can be used to iterate + // over bits through ctz/clz and lowest bit clear. 
+ static typename base_type::comparemask_type + iteration_mask(typename base_type::comparemask_type mask); static SuperVector loadu(void const *ptr); static SuperVector load(void const *ptr); diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index deb3b169..0b4cae58 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -176,9 +176,9 @@ TEST(SuperVectorUtilsTest,Movemask128c){ } } auto SP = SuperVector<16>::loadu(vec); - u16 mask = SP.movemask(); - for(int i=0; i<16; i++) { - if (mask & (1 << i)) { + u64a mask = SP.comparemask(); + for (int i = 0; i < 16; i++) { + if (mask & (1ull << (i * SuperVector<16>::mask_width()))) { vec2[i] = 0xff; } } @@ -195,15 +195,21 @@ TEST(SuperVectorUtilsTest,Eqmask128c){ for (int i = 0; i<16; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<16>::loadu(vec); auto SP1 = SuperVector<16>::loadu(vec2); - int mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 16; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<16>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<16>::mask_width())); + for (u32 i = 2; i < 16; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } } /*Define LSHIFT128 macro*/ @@ -507,9 +513,9 @@ TEST(SuperVectorUtilsTest,Movemask256c){ } } auto SP = SuperVector<32>::loadu(vec); - u32 mask = SP.movemask(); + u64a mask = SP.comparemask(); for(int i=0; i<32; i++) { - if (mask & (1 << i)) { + if (mask & (1ull << (i * SuperVector<32>::mask_width()))) { vec2[i] = 0xff; } } @@ -527,15 +533,21 @@ TEST(SuperVectorUtilsTest,Eqmask256c){ for (int i = 0; i<32; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<32>::loadu(vec); auto SP1 = SuperVector<32>::loadu(vec2); - u32 mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFFFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 32; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<32>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<32>::mask_width())); + for (u32 i = 2; i < 32; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } } TEST(SuperVectorUtilsTest,pshufb256c) { @@ -871,6 +883,8 @@ TEST(SuperVectorUtilsTest,Eqmask512c){ auto SP = SuperVector<64>::loadu(vec); auto SP1 = SuperVector<64>::loadu(vec2); u64a mask = SP.eqmask(SP); + // Mask width for 64 bit type cannot be more than 1. + ASSERT_EQ(SuperVector<64>::mask_width(), 1); ASSERT_EQ(mask,0xFFFFFFFFFFFFFFFF); mask = SP.eqmask(SP1); ASSERT_EQ(mask,0);
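Illustrative sketches (not part of the patch). The core trick: AArch64 has no byte-wise movemask instruction, so comparemask returns 4 identical bits per byte lane instead of 1. SHRN shifts every 16-bit lane right by 4 and narrows it to 8 bits, which collapses a 128-bit vector of 0x00/0xff compare results into a single 64-bit nibble mask. A minimal standalone illustration, assuming AArch64 NEON and GCC/Clang builtins (comparemask16 and find_first_match16 are hypothetical names, not the patch's API):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Each byte of data[0..15] equal to c contributes a 0xF nibble to the
     * returned 64-bit mask, i.e. mask_width == 4. */
    static inline uint64_t comparemask16(const uint8_t *data, uint8_t c) {
        uint8x16_t eq = vceqq_u8(vld1q_u8(data), vdupq_n_u8(c)); /* 0xff per match */
        /* Shift each u16 lane right by 4 and narrow to u8: 16 bytes of
         * 0x00/0xff become 16 nibbles packed into 64 bits. */
        uint8x8_t nibbles = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);
        return vget_lane_u64(vreinterpret_u64_u8(nibbles), 0);
    }

    /* Keep one bit per nibble (the iteration_mask step); a set bit at index
     * 4*i then marks a match at byte i, hence the >> 2 (Z_POSSHIFT). */
    static inline int find_first_match16(const uint8_t *data, uint8_t c) {
        uint64_t z = comparemask16(data, c) & 0x1111111111111111ull;
        return z ? (int)(__builtin_ctzll(z) >> 2) : -1;
    }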
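The scalar loop contract after the redesign, restated: eqmask()/comparemask() yield mask_width() identical bits per byte (4 on AArch64 via shrn, 1 on x86 and ppc64el), iteration_mask() thins that to one bit per byte so findAndClearLSB visits each match exactly once, and Z_POSSHIFT (2 on AArch64, 0 elsewhere) turns the recovered bit index back into a byte offset. A hedged sketch of the consuming loop, modelled on single_zscan (report is a stand-in for the real callback, and Z_POSSHIFT is redefined locally for the sketch):

    #include <stddef.h>
    #include <stdint.h>

    #define Z_POSSHIFT 2 /* AArch64 value; 0 on the other targets */

    /* Calls report(byte_offset) once per match encoded in z, where z is an
     * eqmask() result already thinned by iteration_mask(). */
    static inline void for_each_match(uint64_t z, void (*report)(size_t)) {
        while (z) {
            uint64_t bit = (uint64_t)__builtin_ctzll(z); /* findAndClearLSB_64 finds... */
            z &= z - 1;                                  /* ...and clears the lowest bit */
            report((size_t)(bit >> Z_POSSHIFT));         /* bit index -> byte offset */
        }
    }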
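The same narrowing trick replaces the horizontal add in diff128: after a vector compare, the 64-bit nibble mask is all ones exactly when every byte pair was equal, so one scalar comparison against ~0ull detects any difference. A standalone sketch (the patch compares s32 lanes; byte lanes are used here, which tests the same whole-vector equality):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Returns 1 if the two 128-bit vectors differ anywhere, else 0. */
    static inline int diff128_sketch(uint8x16_t a, uint8x16_t b) {
        uint16x8_t eq = vreinterpretq_u16_u8(vceqq_u8(a, b));
        uint64_t res = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(eq, 4)), 0);
        return res != ~0ull; /* a zero nibble means some byte differed */
    }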
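Finally, the new packedExtract<16> path for mask_width() != 1 is a bit compaction: it moves the bit at position i * mask_width down to position i, so callers keep receiving a dense 16-bit mask. An equivalent restatement of that loop (same arithmetic, written shift-down-then-up):

    #include <stdint.h>

    /* Folds a mask carrying mask_width bits per lane (4 on AArch64) down to
     * one bit per lane: bit i*mask_width of rv becomes bit i of the result. */
    static inline uint32_t compact_mask16(uint64_t rv, uint32_t mask_width) {
        uint32_t ans = 0;
        for (uint32_t i = 0; i < 16; ++i) {
            ans |= (uint32_t)((rv >> (i * mask_width)) & 1u) << i;
        }
        return ans;
    }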