Initial commit of Hyperscan

2025-10-09 15:52:27 +03:00 · 2015-10-20 09:13:35 +11:00
commit 904e436f11
610 changed files with 213627 additions and 0 deletions
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Bit-twiddling primitives (ctz, compress etc)
+ */
+
+#ifndef BITUTILS_H
+#define BITUTILS_H
+
+#include "ue2common.h"
+#include "popcount.h"
+
+// Decide whether <x86intrin.h> may be used for the language this header is
+// currently being compiled as (C and C++ are probed separately). The HAVE_*
+// macros are not defined in this file; presumably they are produced by the
+// build configuration -- TODO confirm.
+#ifdef __cplusplus
+# if defined(HAVE_CXX_X86INTRIN_H)
+#  define USE_X86INTRIN_H
+# endif
+#else // C, baby
+# if defined(HAVE_C_X86INTRIN_H)
+#  define USE_X86INTRIN_H
+# endif
+#endif
+
+// Same decision for <intrin.h>. That header is only included further down
+// when USE_X86INTRIN_H has not been set.
+#ifdef __cplusplus
+# if defined(HAVE_CXX_INTRIN_H)
+#  define USE_INTRIN_H
+# endif
+#else // C, baby
+# if defined(HAVE_C_INTRIN_H)
+#  define USE_INTRIN_H
+# endif
+#endif
+
+#if defined(USE_X86INTRIN_H)
+#include <x86intrin.h>
+#elif defined(USE_INTRIN_H)
+#include <intrin.h>
+#endif
+
+// MSVC has a different form of inline asm, so the GCC-style __asm__ blocks
+// in this header are switched off on Windows; the functions below then use
+// their non-asm code paths instead.
+#ifdef _WIN32
+#define NO_ASM
+#endif
+
+// Case-handling masks. 0x20 is the bit that distinguishes ASCII upper-case
+// from lower-case letters; CASE_CLEAR is its complement within one byte
+// (0xff & ~0x20), and DOUBLE_CASE_CLEAR repeats CASE_CLEAR in two adjacent
+// bytes. (Their users are not visible in this header.)
+#define CASE_BIT          0x20
+#define CASE_CLEAR        0xdf
+#define DOUBLE_CASE_CLEAR 0xdfdf
+
+/**
+ * \brief Count the leading (most significant) zero bits of a 32-bit value.
+ *
+ * \param x Value to inspect; must be non-zero (asserted).
+ * \return Number of zero bits above the highest set bit, in the range 0..31.
+ */
+static really_inline
+u32 clz32(u32 x) {
+    assert(x); // behaviour not defined for x == 0
+#if !defined(_WIN32)
+    return (u32)__builtin_clz(x);
+#else
+    // _BitScanReverse reports the index of the highest set bit, so convert
+    // that index into a count of the zero bits above it.
+    unsigned long top_bit;
+    _BitScanReverse(&top_bit, x);
+    return 31 - top_bit;
+#endif
+}
+
+/**
+ * \brief Count the leading (most significant) zero bits of a 64-bit value.
+ *
+ * \param x Value to inspect; must be non-zero (asserted).
+ * \return Number of zero bits above the highest set bit, in the range 0..63.
+ */
+static really_inline
+u32 clz64(u64a x) {
+    assert(x); // behaviour not defined for x == 0
+#if !defined(_WIN32)
+    return (u32)__builtin_clzll(x);
+#else
+    // _BitScanReverse64 reports the index of the highest set bit, so convert
+    // that index into a count of the zero bits above it.
+    unsigned long top_bit;
+    _BitScanReverse64(&top_bit, x);
+    return 63 - top_bit;
+#endif
+}
+
+// CTZ (count trailing zero) implementations.
+
+/**
+ * \brief Count the trailing (least significant) zero bits of a 32-bit value.
+ *
+ * \param x Value to inspect; must be non-zero (asserted).
+ * \return Index of the lowest set bit, in the range 0..31.
+ */
+static really_inline
+u32 ctz32(u32 x) {
+    assert(x); // behaviour not defined for x == 0
+#if !defined(_WIN32)
+    return (u32)__builtin_ctz(x);
+#else
+    // The index of the lowest set bit is exactly the trailing-zero count.
+    unsigned long low_bit;
+    _BitScanForward(&low_bit, x);
+    return low_bit;
+#endif
+}
+
+/**
+ * \brief Count the trailing (least significant) zero bits of a 64-bit value.
+ *
+ * \param x Value to inspect; must be non-zero (asserted).
+ * \return Index of the lowest set bit, in the range 0..63.
+ */
+static really_inline
+u32 ctz64(u64a x) {
+    assert(x); // behaviour not defined for x == 0
+#if !defined(_WIN32)
+    return (u32)__builtin_ctzll(x);
+#else
+    // The index of the lowest set bit is exactly the trailing-zero count.
+    unsigned long low_bit;
+    _BitScanForward64(&low_bit, x);
+    return low_bit;
+#endif
+}
+
+/**
+ * \brief Index of the highest set bit of a 32-bit value.
+ *
+ * \param x Input value; zero is accepted.
+ * \return 31 - clz32(x) for non-zero x (i.e. floor(log2(x))), and 0 when
+ *         x is zero.
+ */
+static really_inline
+u32 lg2(u32 x) {
+    // clz32() is undefined for zero, so that case is answered directly.
+    return x ? 31 - clz32(x) : 0;
+}
+
+/**
+ * \brief Index of the highest set bit of a 64-bit value.
+ *
+ * \param x Input value; zero is accepted.
+ * \return 63 - clz64(x) for non-zero x (i.e. floor(log2(x))), and 0 when
+ *         x is zero. The result is widened to u64a.
+ */
+static really_inline
+u64a lg2_64(u64a x) {
+    // clz64() is undefined for zero, so that case is answered directly.
+    return x ? 63 - clz64(x) : 0;
+}
+
+/**
+ * \brief Clear the lowest set bit of *v and report which bit it was.
+ *
+ * \param v In/out: non-zero 32-bit word (asserted); on return its lowest
+ *          set bit has been cleared.
+ * \return Index (0..31) of the bit that was cleared.
+ */
+static really_inline
+u32 findAndClearLSB_32(u32 *v) {
+    assert(*v != 0); // behaviour not defined in this case
+#ifdef NO_ASM
+    const u32 word = *v;
+    const u32 offset = ctz32(word);
+    *v = word & (word - 1); // word - 1 flips the lowest set bit and below
+#else
+    // bsf finds the lowest set bit, btr then resets that bit in place. The
+    // "1" constraint ties the input to the same register as output %1.
+    u32 word = *v;
+    u32 offset;
+    __asm__ ("bsf %1, %0\n"
+             "btr %0, %1\n"
+             : "=r" (offset), "=r" (word)
+             : "1" (word));
+    *v = word;
+#endif
+
+    assert(offset < 32);
+    return offset;
+}
+
+/**
+ * \brief Clear the lowest set bit of *v and report which bit it was.
+ *
+ * Three implementations are selected at compile time: x86-64 inline asm,
+ * a generic 64-bit variant built on ctz64(), and a variant for builds
+ * without ARCH_64_BIT that works on the two 32-bit halves.
+ *
+ * \param v In/out: non-zero 64-bit word (asserted); on return its lowest
+ *          set bit has been cleared.
+ * \return Index (0..63) of the bit that was cleared.
+ */
+static really_inline
+u32 findAndClearLSB_64(u64a *v) {
+    assert(*v != 0); // behaviour not defined in this case
+
+#if defined(ARCH_64_BIT) && defined(ARCH_X86_64) && !defined(NO_ASM)
+    // bsfq finds the lowest set bit, btrq then resets that bit in place.
+    u64a word = *v;
+    u64a offset;
+    __asm__ ("bsfq %1, %0\n"
+             "btrq %0, %1\n"
+             : "=r" (offset), "=r" (word)
+             : "1" (word));
+    *v = word;
+#elif defined(ARCH_64_BIT)
+    // generic 64-bit variant using ctz64()
+    const u64a word = *v;
+    const u64a offset = ctz64(word);
+    *v = word & (word - 1);
+#else
+    // Handle the value as two 32-bit halves, since gcc-4.1 doesn't inline
+    // calls to __builtin_ctzll.
+    u32 lo = *v;
+    u32 hi = (*v >> 32);
+    u32 offset;
+    if (!lo) {
+        // Nothing set in the low half: the answer lives in the high half.
+        offset = findAndClearLSB_32(&hi) + 32;
+        *v = (u64a)hi << 32;
+    } else {
+        offset = findAndClearLSB_32(&lo);
+        *v = ((u64a)hi << 32) | (u64a)lo;
+    }
+#endif
+
+    assert(offset < 64);
+    return (u32)offset;
+}
+
+/**
+ * \brief Clear the highest set bit of *v and report which bit it was.
+ *
+ * \param v In/out: non-zero 32-bit word (asserted); on return its highest
+ *          set bit has been cleared.
+ * \return Index (0..31) of the bit that was cleared.
+ */
+static really_inline
+u32 findAndClearMSB_32(u32 *v) {
+    assert(*v != 0); // behaviour not defined in this case
+#ifndef NO_ASM
+    // bsr finds the highest set bit, btr then resets that bit in place. The
+    // "1" constraint ties the input to the same register as output %1.
+    u32 val = *v, offset;
+    __asm__ ("bsr %1, %0\n"
+             "btr %0, %1\n"
+             : "=r" (offset), "=r" (val)
+             : "1" (val));
+    *v = val;
+#else
+    u32 val = *v;
+    u32 offset = 31 - clz32(val);
+    // Build the mask from an unsigned one: offset may be 31, and shifting a
+    // signed 1 into the sign bit (1 << 31) is undefined behaviour in C.
+    *v = val & ~(1U << offset);
+#endif
+    assert(offset < 32);
+    return offset;
+}
+
+/**
+ * \brief Clear the highest set bit of *v and report which bit it was.
+ *
+ * Three implementations are selected at compile time: x86-64 inline asm,
+ * a generic 64-bit variant built on clz64(), and a variant for builds
+ * without ARCH_64_BIT that works on the two 32-bit halves.
+ *
+ * \param v In/out: non-zero 64-bit word (asserted); on return its highest
+ *          set bit has been cleared.
+ * \return Index (0..63) of the bit that was cleared.
+ */
+static really_inline
+u32 findAndClearMSB_64(u64a *v) {
+    assert(*v != 0); // behaviour not defined in this case
+
+#if defined(ARCH_64_BIT) && defined(ARCH_X86_64) && !defined(NO_ASM)
+    // bsrq finds the highest set bit, btrq then resets that bit in place.
+    u64a word = *v;
+    u64a offset;
+    __asm__ ("bsrq %1, %0\n"
+             "btrq %0, %1\n"
+             : "=r" (offset), "=r" (word)
+             : "1" (word));
+    *v = word;
+#elif defined(ARCH_64_BIT)
+    // generic 64-bit variant using clz64()
+    const u64a word = *v;
+    const u64a offset = 63 - clz64(word);
+    *v = word & ~(1ULL << offset);
+#else
+    // Handle the value as two 32-bit halves rather than with a single
+    // 64-bit builtin (the historical reason given is that gcc-4.1 does not
+    // inline the 64-bit bit-count builtins).
+    u32 lo = *v;
+    u32 hi = (*v >> 32);
+    u32 offset;
+    if (!hi) {
+        // Nothing set in the high half: the answer lives in the low half.
+        offset = findAndClearMSB_32(&lo);
+        *v = (u64a)lo;
+    } else {
+        offset = findAndClearMSB_32(&hi) + 32;
+        *v = ((u64a)hi << 32) | (u64a)lo;
+    }
+#endif
+
+    assert(offset < 64);
+    return (u32)offset;
+}
+
+/**
+ * \brief Gather the bits of x selected by mask m into the low-order bits of
+ * the result, preserving their relative order.
+ *
+ * With BMI2 this is the single instruction behind _pext_u32(); otherwise a
+ * software loop of five rounds is used, where round i moves the selected
+ * bits right by (1 << i) positions.
+ *
+ * \param x Source bits.
+ * \param m Mask choosing which bits of x are kept.
+ * \return The selected bits packed together starting at bit 0.
+ */
+static really_inline
+u32 compress32(u32 x, u32 m) {
+#if defined(__BMI2__)
+    // BMI2 has a single instruction for this operation.
+    return _pext_u32(x, m);
+#else
+    // Return zero quickly on trivial cases
+    if (!(x & m)) {
+        return 0;
+    }
+
+    u32 mask = m;
+    u32 bits = x & mask;     // clear irrelevant bits
+    u32 zeros = ~mask << 1;  // we will count 0's to right
+
+    for (u32 round = 0; round < 5; round++) {
+        const u32 dist = 1 << round;
+
+        // parallel suffix over the remaining zero markers
+        u32 suffix = zeros ^ (zeros << 1);
+        suffix ^= suffix << 2;
+        suffix ^= suffix << 4;
+        suffix ^= suffix << 8;
+        suffix ^= suffix << 16;
+
+        const u32 movers = suffix & mask; // bits to move
+        mask = (mask ^ movers) | (movers >> dist); // compress m
+        const u32 moving = bits & movers;
+        bits = (bits ^ moving) | (moving >> dist); // compress x
+        zeros &= ~suffix;
+    }
+
+    return bits;
+#endif
+}
+
+/**
+ * \brief Gather the bits of x selected by mask m into the low-order bits of
+ * the result, preserving their relative order (64-bit variant).
+ *
+ * On x86-64 with BMI2 this is the single instruction behind _pext_u64();
+ * otherwise a software loop of six rounds is used, where round i moves the
+ * selected bits right by (1 << i) positions.
+ *
+ * \param x Source bits.
+ * \param m Mask choosing which bits of x are kept.
+ * \return The selected bits packed together starting at bit 0.
+ */
+static really_inline
+u64a compress64(u64a x, u64a m) {
+#if defined(ARCH_X86_64) && defined(__BMI2__)
+    // BMI2 has a single instruction for this operation.
+    return _pext_u64(x, m);
+#else
+    // Return zero quickly on trivial cases
+    if (!(x & m)) {
+        return 0;
+    }
+
+    u64a mask = m;
+    u64a bits = x & mask;     // clear irrelevant bits
+    u64a zeros = ~mask << 1;  // we will count 0's to right
+
+    for (u32 round = 0; round < 6; round++) {
+        const int dist = 1 << round;
+
+        // parallel suffix over the remaining zero markers
+        u64a suffix = zeros ^ (zeros << 1);
+        suffix ^= suffix << 2;
+        suffix ^= suffix << 4;
+        suffix ^= suffix << 8;
+        suffix ^= suffix << 16;
+        suffix ^= suffix << 32;
+
+        const u64a movers = suffix & mask; // bits to move
+        mask = (mask ^ movers) | (movers >> dist); // compress m
+        const u64a moving = bits & movers;
+        bits = (bits ^ moving) | (moving >> dist); // compress x
+        zeros &= ~suffix;
+    }
+
+    return bits;
+#endif
+}
+
+/**
+ * \brief Scatter the low-order bits of x to the bit positions selected by
+ * mask m, preserving their relative order.
+ *
+ * With BMI2 this is the single instruction behind _pdep_u32(); otherwise the
+ * per-round move masks are first computed from m (five rounds) and then
+ * replayed in reverse order, shifting bits left by (1 << i) in round i.
+ *
+ * \param x Bits to distribute, taken from bit 0 upwards.
+ * \param m Mask naming the destination positions.
+ * \return Value whose set bits all lie within m.
+ */
+static really_inline
+u32 expand32(u32 x, u32 m) {
+#if defined(__BMI2__)
+    // BMI2 has a single instruction for this operation.
+    return _pdep_u32(x, m);
+#else
+    // Return zero quickly on trivial cases
+    if (x == 0 || m == 0) {
+        return 0;
+    }
+
+    const u32 full_mask = m;  // save original mask
+    u32 moves[5];
+    u32 zeros = ~m << 1;      // we will count 0's to right
+
+    for (int round = 0; round < 5; round++) {
+        u32 suffix = zeros ^ (zeros << 1); // parallel suffix
+        suffix ^= suffix << 2;
+        suffix ^= suffix << 4;
+        suffix ^= suffix << 8;
+        suffix ^= suffix << 16;
+
+        const u32 movers = suffix & m; // bits to move
+        moves[round] = movers;
+        m = (m ^ movers) | (movers >> (1 << round)); // compress m
+        zeros &= ~suffix;
+    }
+
+    // Undo the compression steps from the widest move to the narrowest.
+    for (int round = 4; round >= 0; round--) {
+        const u32 movers = moves[round];
+        const u32 shifted = x << (1 << round);
+        x = (x & ~movers) | (shifted & movers);
+    }
+
+    return x & full_mask; // clear out extraneous bits
+#endif
+}
+
+/**
+ * \brief Scatter the low-order bits of x to the bit positions selected by
+ * mask m, preserving their relative order (64-bit variant).
+ *
+ * On x86-64 with BMI2 this is the single instruction behind _pdep_u64();
+ * otherwise the per-round move masks are first computed from m (six rounds)
+ * and then replayed in reverse order, shifting bits left by (1 << i) in
+ * round i.
+ *
+ * \param x Bits to distribute, taken from bit 0 upwards.
+ * \param m Mask naming the destination positions.
+ * \return Value whose set bits all lie within m.
+ */
+static really_inline
+u64a expand64(u64a x, u64a m) {
+#if defined(ARCH_X86_64) && defined(__BMI2__)
+    // BMI2 has a single instruction for this operation.
+    return _pdep_u64(x, m);
+#else
+    // Return zero quickly on trivial cases
+    if (x == 0 || m == 0) {
+        return 0;
+    }
+
+    const u64a full_mask = m;  // save original mask
+    u64a moves[6];
+    u64a zeros = ~m << 1;      // we will count 0's to right
+
+    for (int round = 0; round < 6; round++) {
+        u64a suffix = zeros ^ (zeros << 1); // parallel suffix
+        suffix ^= suffix << 2;
+        suffix ^= suffix << 4;
+        suffix ^= suffix << 8;
+        suffix ^= suffix << 16;
+        suffix ^= suffix << 32;
+
+        const u64a movers = suffix & m; // bits to move
+        moves[round] = movers;
+        m = (m ^ movers) | (movers >> (1 << round)); // compress m
+        zeros &= ~suffix;
+    }
+
+    // Undo the compression steps from the widest move to the narrowest.
+    for (int round = 5; round >= 0; round--) {
+        const u64a movers = moves[round];
+        const u64a shifted = x << (1 << round);
+        x = (x & ~movers) | (shifted & movers);
+    }
+
+    return x & full_mask; // clear out extraneous bits
+#endif
+}
+
+
+/* Iterate over the set bits of a 64-bit bitfield.
+ *
+ * Returns the index of the lowest set bit located strictly above position
+ * begin. Passing begin == ~0U starts the search at bit 0. Returns ~0U when no
+ * such bit exists. Any other begin must be at most 63 (asserted).
+ */
+static really_inline
+u32 bf64_iterate(u64a bitfield, u32 begin) {
+    u64a remaining = bitfield;
+
+    if (begin != ~0U) {
+        /* Switch off all bits at or below begin. Note: it is not legal to
+         * shift by the size of the datatype or larger, hence 2ULL << begin
+         * rather than 1ULL << (begin + 1). */
+        assert(begin <= 63);
+        const u64a at_or_below = (2ULL << begin) - 1;
+        remaining &= ~at_or_below;
+    }
+
+    return remaining ? ctz64(remaining) : ~0U;
+}
+
+/**
+ * \brief Set bit i of *bitfield.
+ *
+ * \param bitfield In/out: 64-bit field to modify.
+ * \param i Bit index; must be below 64 (asserted).
+ * \return 1 if the bit was already set before the call, 0 otherwise.
+ */
+static really_inline
+char bf64_set(u64a *bitfield, u32 i) {
+    assert(i < 64);
+    const u64a bit = 1ULL << i;
+    const char previously = (*bitfield & bit) != 0;
+
+    *bitfield |= bit;
+    return previously;
+}
+
+/**
+ * \brief Clear bit i of *bitfield.
+ *
+ * \param bitfield In/out: 64-bit field to modify.
+ * \param i Bit index; must be below 64 (asserted).
+ */
+static really_inline
+void bf64_unset(u64a *bitfield, u32 i) {
+    assert(i < 64);
+    const u64a keep = ~(1ULL << i);
+    *bitfield &= keep;
+}
+
+#endif // BITUTILS_H