avx512: add basic functions to simd_utils

Extends the m512 type to use avx512 and also changes required for limex.
2026-01-02 14:44:41 +03:00 · 2016-07-20 11:31:34 +10:00
parent fedd48489f
commit 8a56d16d57
11 changed files with 258 additions and 53 deletions
--- a/src/nfa/limex_accel.c
+++ b/src/nfa/limex_accel.c
@@ -151,18 +151,20 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex,
    DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n");
    m512 accelPerm = limex->accelPermute;
    m512 accelComp = limex->accelCompare;
-#if !defined(HAVE_AVX2)
+#if defined(HAVE_AVX512)
+    idx = packedExtract512(s, accelPerm, accelComp);
+#elif defined(HAVE_AVX2)
+    u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
+    u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
+    assert((idx1 & idx2) == 0); // should be no shared bits
+    idx = idx1 | idx2;
+#else
    u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo);
    u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi);
    u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo);
    u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi);
    assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits
    idx = idx1 | idx2 | idx3 | idx4;
-#else
-    u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
-    u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
-    assert((idx1 & idx2) == 0); // should be no shared bits
-    idx = idx1 | idx2;
 #endif
    return accelScanWrapper(accelTable, aux, input, idx, i, end);
 }
--- a/src/nfa/limex_shuffle.h
+++ b/src/nfa/limex_shuffle.h
@@ -62,4 +62,17 @@ u32 packedExtract256(m256 s, const m256 permute, const m256 compare) {
 }
 #endif // AVX2

+#if defined(HAVE_AVX512)
+static really_inline
+u32 packedExtract512(m512 s, const m512 permute, const m512 compare) {
+    // vpshufb doesn't cross lanes, so this is a bit of a cheat
+    m512 shuffled = pshufb_m512(s, permute);
+    m512 compared = and512(shuffled, compare);
+    u64a rv = ~eq512mask(compared, shuffled);
+    // stitch the lane-wise results back together
+    rv = rv >> 32 | rv;
+    return (u32)(((rv >> 16) | rv) & 0xffffU);
+}
+#endif // AVX512
+
 #endif // LIMEX_SHUFFLE_H
--- a/src/nfa/nfa_build_util.cpp
+++ b/src/nfa/nfa_build_util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -401,7 +401,7 @@ const char *NFATraits<SHENG_NFA>::name = "Sheng";
 template<> struct NFATraits<TAMARAMA_NFA> {
    UNUSED static const char *name;
    static const NFACategory category = NFA_OTHER;
-    static const u32 stateAlign = 32;
+    static const u32 stateAlign = 64;
    static const bool fast = true;
    static const nfa_dispatch_fn has_accel;
    static const nfa_dispatch_fn has_repeats;