Rose: allow block-mode merge of small prefixes

Previously, we disallowed the merging of all Rose prefixes in block mode where the literal sets are not identical. This change allows merging if the prefix graphs to be merged are very small, as a small performance improvement for cases with lots of tiny prefixes. This check is deliberately conservative: graphs must have some common vertices, and the result of the merge must not give up any accelerability.
2025-06-28 16:41:01 +03:00 · 2016-02-09 10:01:53 +11:00 · 2016-02-09 10:01:53 +11:00 · 7b54856642
commit 7b54856642
parent 670eff5bc0
3 changed files with 112 additions and 37 deletions
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@ -2187,7 +2187,7 @@ u32 countAccelStates(NGHolder &h,

    if (!cc.grey.allowLimExNFA) {
        DEBUG_PRINTF("limex not allowed\n");
-        return NFA_MAX_ACCEL_STATES + 1;
+        return 0;
    }

    // Sanity check the input data.
@ -2201,11 +2201,11 @@ u32 countAccelStates(NGHolder &h,
                  do_accel, state_compression, cc, num_states);

    // Acceleration analysis.
-    fillAccelInfo(bi);
+    nfaFindAccelSchemes(bi.h, bi.br_cyclic, &bi.accel.accel_map);

-    u32 num_accel = verify_u32(bi.accel.accelerable.size());
+    u32 num_accel = verify_u32(bi.accel.accel_map.size());
    DEBUG_PRINTF("found %u accel states\n", num_accel);
-    return min(num_accel, (u32)NFA_MAX_ACCEL_STATES);
+    return num_accel;
 }

 } // namespace ue2
--- a/src/nfa/limex_compile.h
+++ b/src/nfa/limex_compile.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -79,11 +79,10 @@ aligned_unique_ptr<NFA> generate(NGHolder &g,
                        const CompileContext &cc);

 /**
- * \brief For a given graph, count the number of accel states it will have in
- * an implementation.
+ * \brief For a given graph, count the number of accelerable states it has.
 *
- * \return the number of accel states, or NFA_MAX_ACCEL_STATES + 1 if an
- * implementation would not be constructible.
+ * Note that this number may be greater than the number that are actually
+ * implementable.
 */
 u32 countAccelStates(NGHolder &h,
                     const ue2::unordered_map<NFAVertex, u32> &states,
--- a/src/rose/rose_build_merge.cpp
+++ b/src/rose/rose_build_merge.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -105,6 +105,10 @@ static const size_t DFA_MERGE_MAX_STATES = 8000;
 * merging with other graphs. */
 static const size_t LARGE_LBR_MIN_VERTICES = 32;

+/** \brief In block mode, merge two prefixes even if they don't have identical
+ * literal sets if they have fewer than this many states and the merged graph
+ * is also small. */
+static constexpr size_t MAX_BLOCK_PREFIX_MERGE_VERTICES = 32;

 static
 size_t small_merge_max_vertices(const CompileContext &cc) {
@ -930,6 +934,99 @@ bool compatibleLiteralsForMerge(
    return true;
 }

+/**
+ * True if this graph has few enough accel states to be implemented as an NFA
+ * with all of those states actually becoming accel schemes.
+ */
+static
+bool isAccelerableLeftfix(const RoseBuildImpl &build, const NGHolder &g) {
+    u32 num = countAccelStates(g, &build.rm, build.cc);
+    DEBUG_PRINTF("graph with %zu vertices has %u accel states\n",
+                  num_vertices(g), num);
+    return num <= NFA_MAX_ACCEL_STATES;
+}
+
+/**
+ * In block mode, we want to be a little more selective, We will only merge
+ * prefix engines when the literal sets are the same, or if the merged graph
+ * has only grown by a small amount.
+ */
+static
+bool safeBlockModeMerge(const RoseBuildImpl &build, RoseVertex u,
+                        RoseVertex v) {
+    assert(!build.cc.streaming);
+    assert(build.isRootSuccessor(u) == build.isRootSuccessor(v));
+
+    // Always merge infixes if we can (subject to the other criteria in
+    // mergeableRoseVertices).
+    if (!build.isRootSuccessor(u)) {
+        return true;
+    }
+
+    const RoseGraph &g = build.g;
+
+    // Merge prefixes with identical literal sets (as we'd have to run them
+    // both when we see those literals anyway).
+    if (g[u].literals == g[v].literals) {
+        return true;
+    }
+
+    // The rest of this function only deals with the case when both vertices
+    // have graph leftfixes.
+    if (!g[u].left.graph || !g[v].left.graph) {
+        return false;
+    }
+
+    const size_t u_count = num_vertices(*g[u].left.graph);
+    const size_t v_count = num_vertices(*g[v].left.graph);
+    DEBUG_PRINTF("u prefix has %zu vertices, v prefix has %zu vertices\n",
+                 u_count, v_count);
+    if (u_count > MAX_BLOCK_PREFIX_MERGE_VERTICES ||
+        v_count > MAX_BLOCK_PREFIX_MERGE_VERTICES) {
+        DEBUG_PRINTF("prefixes too big already\n");
+        return false;
+    }
+
+    DEBUG_PRINTF("trying merge\n");
+    NGHolder h;
+    cloneHolder(h, *g[v].left.graph);
+    if (!mergeNfaPair(*g[u].left.graph, h, nullptr, build.cc)) {
+        DEBUG_PRINTF("couldn't merge\n");
+        return false;
+    }
+
+    const size_t merged_count = num_vertices(h);
+    DEBUG_PRINTF("merged result has %zu vertices\n", merged_count);
+    if (merged_count > MAX_BLOCK_PREFIX_MERGE_VERTICES) {
+        DEBUG_PRINTF("exceeded limit\n");
+        return false;
+    }
+
+    // We want to only perform merges that take advantage of some
+    // commonality in the two input graphs, so we check that the number of
+    // vertices has only grown a small amount: somewhere between the sum
+    // (no commonality) and the max (no growth at all) of the vertex counts
+    // of the input graphs.
+    const size_t max_size = u_count + v_count;
+    const size_t min_size = max(u_count, v_count);
+    const size_t max_growth = ((max_size - min_size) * 25) / 100;
+    if (merged_count > min_size + max_growth) {
+        DEBUG_PRINTF("grew too much\n");
+        return false;
+    }
+
+    // We don't want to squander any chances at accelerating.
+    if (!isAccelerableLeftfix(build, h) &&
+        (isAccelerableLeftfix(build, *g[u].left.graph) ||
+         isAccelerableLeftfix(build, *g[v].left.graph))) {
+        DEBUG_PRINTF("would lose accel property\n");
+        return false;
+    }
+
+    DEBUG_PRINTF("safe to merge\n");
+    return true;
+}
+
 bool mergeableRoseVertices(const RoseBuildImpl &tbi, RoseVertex u,
                           RoseVertex v) {
    assert(u != v);
@ -938,15 +1035,8 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi, RoseVertex u,
        return false;
    }

-    // UE-1675: in block mode, we want to be a little more selective -- only
-    // merge prefix roses when the literal sets are the same.
-    if (!tbi.cc.streaming && tbi.isRootSuccessor(u)) {
-        assert(tbi.isRootSuccessor(v));
-
-        if (tbi.g[u].literals != tbi.g[v].literals) {
-            DEBUG_PRINTF("literals aren't identical (block mode prefix)\n");
-            return false;
-        }
+    if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, u, v)) {
+        return false;
    }

    /* We cannot merge prefixes/vertices if they are successors of different
@ -1102,15 +1192,8 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi,

    vector<pair<const rose_literal_id *, u32>> ulits; /* lit + lag pairs */
    for (auto a : verts1) {
-        // UE-1675: in block mode, we want to be a little more selective --
-        // only merge prefix roses when the literal sets are the same.
-        if (!tbi.cc.streaming && is_prefix) {
-            assert(tbi.isRootSuccessor(a));
-
-            if (tbi.g[u_front].literals != tbi.g[a].literals) {
-                DEBUG_PRINTF("literals aren't identical (block mode prefix)\n");
-                return false;
-            }
+        if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, u_front, a)) {
+            return false;
        }

        u32 ulag = tbi.g[a].left.lag;
@ -1121,15 +1204,8 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi,

    vector<pair<const rose_literal_id *, u32>> vlits;
    for (auto a : verts2) {
-        // UE-1675: in block mode, we want to be a little more selective --
-        // only merge prefix roses when the literal sets are the same.
-        if (!tbi.cc.streaming && is_prefix) {
-            assert(tbi.isRootSuccessor(a));
-
-            if (tbi.g[u_front].literals != tbi.g[a].literals) {
-                DEBUG_PRINTF("literals aren't identical (block mode prefix)\n");
-                return false;
-            }
+        if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, u_front, a)) {
+            return false;
        }

        u32 vlag = tbi.g[a].left.lag;