move andn helper function to bitutils.h

Commit: 9f3ad89ed6
Parent: 6581aae90e
Repository: mirror of https://github.com/VectorCamp/vectorscan.git
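For context, the helper being moved computes a 32-bit unaligned load from b masked with the complement of a (an ANDN), and the diff below splits it into three layers: a portable andn_impl_c in the common bitutils header, an x86 andn_impl that can force the BMI ANDN instruction via inline asm, and an andn wrapper in util/bitutils.h for callers. A minimal standalone sketch of the portable semantics, assuming nothing from the vectorscan tree (load_u32 is a hypothetical stand-in for util/unaligned.h's unaligned_load_u32):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for unaligned_load_u32(): memcpy-based safe 32-bit load. */
static uint32_t load_u32(const uint8_t *p) {
    uint32_t v;
    memcpy(&v, p, sizeof(v));
    return v;
}

/* Portable ANDN, equivalent to andn_impl_c() in the diff: ~a & load32(b). */
static uint64_t andn_ref(uint32_t a, const uint8_t *b) {
    return load_u32(b) & ~a;
}

int main(void) {
    const uint8_t buf[4] = { 0xff, 0x0f, 0x00, 0xf0 };
    /* On a little-endian host the load is 0xf0000fff; clearing the low byte
     * with a = 0xff yields 0xf0000f00. */
    printf("%08llx\n", (unsigned long long)andn_ref(0x000000ffu, buf));
    return 0;
}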
@@ -36,6 +36,7 @@
 #include "teddy.h"
 #include "teddy_internal.h"
 #include "util/arch.h"
+#include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/uniform_ops.h"
 
@@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
 };
 
-/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
- * so we force its generation.
- */
-static really_inline
-u64a andn(const u32 a, const u8 *b) {
-    u64a r;
-#if defined(HAVE_BMI) && !defined(NO_ASM)
-    __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
-#else
-    r = unaligned_load_u32(b) & ~a;
-#endif
-    return r;
-}
-
 /* generates an initial state mask based on the last byte-ish of history rather
  * than being all accepting. If there is no history to consider, the state is
  * generated based on the minimum length of each bucket in order to prevent
@@ -34,6 +34,7 @@
 #define BITUTILS_ARCH_COMMON_H
 
 #include "util/popcount.h"
+#include "util/unaligned.h"
 
 static really_inline
 u32 clz32_impl_c(u32 x) {
@@ -350,4 +351,12 @@ u64a pext64_impl_c(u64a x, u64a mask) {
     return result;
 }
 
+/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
+ * so we force its generation.
+ */
+static really_inline
+u64a andn_impl_c(const u32 a, const u8 *b) {
+    return unaligned_load_u32(b) & ~a;
+}
+
 #endif // BITUTILS_ARCH_COMMON_H
@@ -301,4 +301,18 @@ u64a pdep64(u64a x, u64a mask) {
 }
 #endif
 
+/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
+ * so we force its generation.
+ */
+static really_inline
+u64a andn_impl(const u32 a, const u8 *b) {
+#if defined(HAVE_BMI) && !defined(NO_ASM)
+    u64a r;
+    __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
+    return r;
+#else
+    return andn_impl_c(a, b);
+#endif
+}
+
 #endif // BITUTILS_ARCH_X86_H
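An aside on the HAVE_BMI branch above: the same forced ANDN could also be expressed with the BMI intrinsic _andn_u32 from <immintrin.h>, which computes ~a & b. The commit keeps the inline asm, which also pins the memory operand so the 32-bit load is folded into the instruction; the sketch below only illustrates the intrinsic alternative, guarded by the GCC/Clang __BMI__ macro rather than vectorscan's HAVE_BMI:

#include <stdint.h>
#include <string.h>
#if defined(__BMI__)
#include <immintrin.h>
#endif

/* ANDN via the BMI intrinsic when compiled with -mbmi, portable C otherwise. */
static uint64_t andn_intrinsic(uint32_t a, const uint8_t *b) {
    uint32_t v;
    memcpy(&v, b, sizeof(v));          /* unaligned 32-bit load */
#if defined(__BMI__)
    return _andn_u32(a, v);            /* ~a & v, a single ANDN instruction */
#else
    return v & ~a;
#endif
}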
@@ -167,4 +167,12 @@ u64a pext64(u64a x, u64a mask) {
     return pext64_impl(x, mask);
 }
 
+/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
+ * so we force its generation.
+ */
+static really_inline
+u64a andn(const u32 a, const u8 *b) {
+    return andn_impl_c(a, b);
+}
+
 #endif // BITUTILS_H
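A quick, self-contained way to sanity-check that the forced-asm path and the portable path agree (a hypothetical test harness, not part of the commit; it uses simplified local copies of both implementations instead of including the vectorscan headers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified copy of the portable path (andn_impl_c). */
static uint64_t andn_c(uint32_t a, const uint8_t *b) {
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v & ~a;
}

/* Simplified copy of the x86 asm path; falls back to the C path elsewhere. */
static uint64_t andn_asm(uint32_t a, const uint8_t *b) {
#if defined(__BMI__) && defined(__x86_64__)
    uint64_t r;
    __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const uint32_t *)b));
    return r;
#else
    return andn_c(a, b);
#endif
}

int main(void) {
    uint8_t buf[4];
    for (uint32_t i = 0; i < 1000; i++) {
        uint32_t word = i * 2654435761u;    /* cheap input mixing */
        uint32_t mask = ~(i * 40503u);
        memcpy(buf, &word, sizeof(word));
        assert(andn_c(mask, buf) == andn_asm(mask, buf));
    }
    printf("andn paths agree\n");
    return 0;
}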