rose: simplify long lit table, add bloom filter

Replaces the original long lit hash table (used in streaming mode) with a smaller, simpler linear probing approach. Adds a bloom filter in front of it to reduce time spent on false positives. Both the hash table and the bloom filter are sized based on a maximum load factor.
2025-10-10 00:02:24 +03:00 · 2016-09-22 13:58:42 +10:00
parent 68bf473e2e
commit 8869dee643
8 changed files with 520 additions and 435 deletions
--- a/src/rose/rose_build_long_lit.cpp
+++ b/src/rose/rose_build_long_lit.cpp
@@ -36,17 +36,28 @@
 #include "util/verify_types.h"
 #include "util/compile_context.h"

+#include <algorithm>
+#include <numeric>
+
 using namespace std;

 namespace ue2 {

-/** \brief Minimum size for a non-empty hash table. */
-static constexpr u32 MIN_HASH_TABLE_SIZE = 4096;
+/** \brief Minimum size for a non-empty hash table. Must be a power of two. */
+static constexpr u32 MIN_HASH_TABLE_SIZE = 128;
+
+/** \brief Maximum load factor (between zero and one) for a hash table. */
+static constexpr double MAX_HASH_TABLE_LOAD = 0.7;
+
+/** \brief Minimum size (in bits) for a bloom filter. Must be a power of two. */
+static constexpr u32 MIN_BLOOM_FILTER_SIZE = 256;
+
+/** \brief Maximum load factor (between zero and one) for a bloom filter. */
+static constexpr double MAX_BLOOM_FILTER_LOAD = 0.25;

 struct LongLitModeInfo {
-    u32 boundary = 0;    //!< One above the largest index for this mode.
-    u32 positions = 0;   //!< Total number of string positions.
-    u32 hashEntries = 0; //!< Number of hash table entries.
+    u32 num_literals = 0; //!< Number of strings for this mode.
+    u32 hashed_positions = 0; //!< Number of hashable string positions.
 };

 struct LongLitInfo {
@@ -66,54 +77,120 @@ static
 LongLitInfo analyzeLongLits(const vector<ue2_case_string> &lits,
                            size_t max_len) {
    LongLitInfo info;
-    u32 hashedPositionsCase = 0;
-    u32 hashedPositionsNocase = 0;
-
-    // Caseful boundary is the index of the first nocase literal, as we're
-    // ordered (caseful, nocase).
-    auto first_nocase = find_if(begin(lits), end(lits),
-                [](const ue2_case_string &lit) { return lit.nocase; });
-    info.caseful.boundary = verify_u32(distance(lits.begin(), first_nocase));
-
-    // Nocase boundary is the size of the literal set.
-    info.nocase.boundary = verify_u32(lits.size());

    for (const auto &lit : lits) {
-        if (lit.nocase) {
-            hashedPositionsNocase += lit.s.size() - max_len;
-            info.nocase.positions += lit.s.size();
-        } else {
-            hashedPositionsCase += lit.s.size() - max_len;
-            info.caseful.positions += lit.s.size();
-        }
+        auto &lit_info = lit.nocase ? info.nocase : info.caseful;
+        assert(lit.s.size() > max_len);
+        lit_info.num_literals++;
+        lit_info.hashed_positions += lit.s.size() - max_len;
    }

-    info.caseful.hashEntries = hashedPositionsCase
-        ? roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, hashedPositionsCase))
-        : 0;
-    info.nocase.hashEntries = hashedPositionsNocase
-        ? roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, hashedPositionsNocase))
-        : 0;
-
-    DEBUG_PRINTF("caseful:  boundary=%u, positions=%u, hashedPositions=%u, "
-                 "hashEntries=%u\n",
-                 info.caseful.boundary, info.caseful.positions,
-                 hashedPositionsCase, info.caseful.hashEntries);
-    DEBUG_PRINTF("nocase: boundary=%u, positions=%u, hashedPositions=%u, "
-                 "hashEntries=%u\n",
-                 info.nocase.boundary, info.nocase.positions,
-                 hashedPositionsNocase, info.nocase.hashEntries);
+    DEBUG_PRINTF("case: hashed %u positions\n", info.caseful.hashed_positions);
+    DEBUG_PRINTF("nocase: hashed %u positions\n", info.nocase.hashed_positions);

    return info;
 }

 static
-void fillHashes(const vector<ue2_case_string> &lits, size_t max_len,
-                RoseLongLitHashEntry *tab, size_t numEntries, bool nocase,
-                const map<u32, u32> &litToOffsetVal) {
-    const u32 nbits = lg2(numEntries);
-    map<u32, deque<pair<u32, u32>>> bucketToLitOffPairs;
-    map<u32, u64a> bucketToBitfield;
+void addToBloomFilter(vector<u8> &bloom, const u8 *substr, bool nocase) {
+    const u32 num_keys = verify_u32(bloom.size() * 8);
+    const u32 key_mask = (1U << lg2(num_keys)) -1;
+
+    const auto hash_functions = { bloomHash_1, bloomHash_2, bloomHash_3 };
+    for (const auto &hash_func : hash_functions) {
+        u32 hash = hash_func(substr, nocase);
+        u32 key = hash & key_mask;
+        DEBUG_PRINTF("set key %u (of %zu)\n", key, bloom.size() * 8);
+        bloom[key / 8] |= 1U << (key % 8);
+    }
+}
+
+static
+size_t bloomOccupancy(const vector<u8> &bloom) {
+    return accumulate(begin(bloom), end(bloom), 0,
+                      [](const size_t &sum, const u8 &elem) {
+                          return sum + popcount32(elem);
+                      });
+}
+
+static
+double bloomLoad(const vector<u8> &bloom) {
+    return (double)bloomOccupancy(bloom) / (double)(bloom.size() * 8);
+}
+
+static
+vector<u8> buildBloomFilter(const vector<ue2_case_string> &lits, size_t max_len,
+                            size_t num_entries, bool nocase) {
+    assert(num_entries % 8 == 0);
+    assert((num_entries & (num_entries - 1)) == 0); // Must be power of two.
+
+    vector<u8> bloom(num_entries / 8, 0);
+
+    if (!num_entries) {
+        return bloom;
+    }
+
+    for (const auto &lit : lits) {
+        if (nocase != lit.nocase) {
+            continue;
+        }
+        for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) {
+            const u8 *substr = (const u8 *)lit.s.c_str() + offset;
+            addToBloomFilter(bloom, substr, nocase);
+        }
+    }
+
+    DEBUG_PRINTF("%s bloom filter occupancy %zu of %zu entries\n",
+                 nocase ? "nocase" : "caseful", bloomOccupancy(bloom),
+                 num_entries);
+
+    return bloom;
+}
+
+
+static
+vector<u8> makeBloomFilter(const vector<ue2_case_string> &lits,
+                           size_t max_len, bool nocase) {
+    vector<u8> bloom;
+
+    size_t num_entries = MIN_BLOOM_FILTER_SIZE;
+    for (;;) {
+        bloom = buildBloomFilter(lits, max_len, num_entries, nocase);
+        DEBUG_PRINTF("built %s bloom for %zu entries: load %f\n",
+                     nocase ? "nocase" : "caseful", num_entries,
+                     bloomLoad(bloom));
+        if (bloomLoad(bloom) < MAX_BLOOM_FILTER_LOAD) {
+            break;
+        }
+        num_entries *= 2;
+    }
+    return bloom;
+}
+
+static
+size_t hashTableOccupancy(const vector<RoseLongLitHashEntry> &tab) {
+    return count_if(begin(tab), end(tab), [](const RoseLongLitHashEntry &ent) {
+        return ent.str_offset != 0;
+    });
+}
+
+static
+double hashTableLoad(const vector<RoseLongLitHashEntry> &tab) {
+    return (double)hashTableOccupancy(tab) / (double)(tab.size());
+}
+
+static
+vector<RoseLongLitHashEntry> buildHashTable(const vector<ue2_case_string> &lits,
+                                            size_t max_len,
+                                            const vector<u32> &litToOffsetVal,
+                                            size_t numEntries, bool nocase) {
+    vector<RoseLongLitHashEntry> tab(numEntries, {0,0});
+
+    if (!numEntries) {
+        return tab;
+    }
+
+    map<u32, vector<pair<u32, u32>>> hashToLitOffPairs;

    for (u32 lit_id = 0; lit_id < lits.size(); lit_id++) {
        const ue2_case_string &lit = lits[lit_id];
@@ -122,37 +199,41 @@ void fillHashes(const vector<ue2_case_string> &lits, size_t max_len,
        }
        for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) {
            const u8 *substr = (const u8 *)lit.s.c_str() + offset;
-            u32 h = hashLongLiteral(substr, max_len, lit.nocase);
-            u32 h_ent = h & ((1U << nbits) - 1);
-            u32 h_low = (h >> nbits) & 63;
-            bucketToLitOffPairs[h_ent].emplace_back(lit_id, offset);
-            bucketToBitfield[h_ent] |= (1ULL << h_low);
+            u32 hash = hashLongLiteral(substr, max_len, lit.nocase);
+            hashToLitOffPairs[hash].emplace_back(lit_id, offset);
        }
    }

-    // this used to be a set<u32>, but a bitset is much much faster given that
-    // we're using it only for membership testing.
-    boost::dynamic_bitset<> filledBuckets(numEntries); // all zero by default.
+    for (auto &m : hashToLitOffPairs) {
+        u32 hash = m.first;
+        vector<pair<u32, u32>> &d = m.second;

-    // sweep out bitfield entries and save the results swapped accordingly
-    // also, anything with bitfield entries is put in filledBuckets
-    for (const auto &m : bucketToBitfield) {
-        const u32 &bucket = m.first;
-        const u64a &contents = m.second;
-        tab[bucket].bitfield = contents;
-        filledBuckets.set(bucket);
-    }
+        // Sort by (offset, string) so that we'll be able to remove identical
+        // string prefixes.
+        stable_sort(begin(d), end(d),
+                    [&](const pair<u32, u32> &a, const pair<u32, u32> &b) {
+                        const auto &str_a = lits[a.first].s;
+                        const auto &str_b = lits[b.first].s;
+                        return tie(a.second, str_a) < tie(b.second, str_b);
+                    });

-    // store out all our chains based on free values in our hash table.
-    // find nearest free locations that are empty (there will always be more
-    // entries than strings, at present)
-    for (auto &m : bucketToLitOffPairs) {
-        u32 bucket = m.first;
-        deque<pair<u32, u32>> &d = m.second;
+        // Remove entries that point to the same literal prefix.
+        d.erase(unique(begin(d), end(d),
+                       [&](const pair<u32, u32> &a, const pair<u32, u32> &b) {
+                           if (a.second != b.second) {
+                               return false;
+                           }
+                           const auto &str_a = lits[a.first].s;
+                           const auto &str_b = lits[b.first].s;
+                           const size_t len = max_len + a.second;
+                           return equal(begin(str_a), begin(str_a) + len,
+                                        begin(str_b));
+                       }),
+                end(d));

-        // sort d by distance of the residual string (len minus our depth into
-        // the string). We need to put the 'furthest back' string first...
-        stable_sort(d.begin(), d.end(),
+        // Sort d by distance of the residual string (len minus our depth into
+        // the string). We need to put the 'furthest back' string first.
+        stable_sort(begin(d), end(d),
                    [](const pair<u32, u32> &a, const pair<u32, u32> &b) {
                        if (a.second != b.second) {
                            return a.second > b.second; /* longest is first */
@@ -160,47 +241,79 @@ void fillHashes(const vector<ue2_case_string> &lits, size_t max_len,
                        return a.first < b.first;
                    });

-        while (1) {
-            // first time through is always at bucket, then we fill in links
-            filledBuckets.set(bucket);
-            RoseLongLitHashEntry *ent = &tab[bucket];
-            u32 lit_id = d.front().first;
-            u32 offset = d.front().second;
+        u32 bucket = hash % numEntries;

-            ent->state = verify_u32(litToOffsetVal.at(lit_id) +
-                                    offset + max_len);
-            ent->link = (u32)LINK_INVALID;
-
-            d.pop_front();
-            if (d.empty()) {
-                break;
-            }
-            // now, if there is another value
-            // find a bucket for it and put in 'bucket' and repeat
-            // all we really need to do is find something not in filledBuckets,
-            // ideally something close to bucket
-            // we search backward and forward from bucket, trying to stay as
-            // close as possible.
-            UNUSED bool found = false;
-            int bucket_candidate = 0;
-            for (u32 k = 1; k < numEntries * 2; k++) {
-                bucket_candidate = bucket + (((k & 1) == 0)
-                        ? (-(int)k / 2) : (k / 2));
-                if (bucket_candidate < 0 ||
-                    (size_t)bucket_candidate >= numEntries) {
-                    continue;
-                }
-                if (!filledBuckets.test(bucket_candidate)) {
-                    found = true;
-                    break;
+        // Placement via linear probing.
+        for (const auto &lit_offset : d) {
+            while (tab[bucket].str_offset != 0) {
+                bucket++;
+                if (bucket == numEntries) {
+                    bucket = 0;
                }
            }

-            assert(found);
-            bucket = bucket_candidate;
-            ent->link = bucket;
+            u32 lit_id = lit_offset.first;
+            u32 offset = lit_offset.second;
+
+            DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash,
+                         lit_id, offset, bucket);
+
+            auto &entry = tab[bucket];
+            entry.str_offset = verify_u32(litToOffsetVal.at(lit_id));
+            assert(entry.str_offset != 0);
+            entry.str_len = offset + max_len;
        }
    }
+
+    DEBUG_PRINTF("%s hash table occupancy %zu of %zu entries\n",
+                 nocase ? "nocase" : "caseful", hashTableOccupancy(tab),
+                 numEntries);
+
+    return tab;
+}
+
+static
+vector<RoseLongLitHashEntry> makeHashTable(const vector<ue2_case_string> &lits,
+                                           size_t max_len,
+                                           const vector<u32> &litToOffsetVal,
+                                           u32 numPositions, bool nocase) {
+    vector<RoseLongLitHashEntry> tab;
+
+    // Note: for the hash table, we must always have at least enough entries
+    // for the number of hashable positions.
+    size_t num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE,
+    numPositions));
+
+    for (;;) {
+        tab = buildHashTable(lits, max_len, litToOffsetVal, num_entries,
+                             nocase);
+        DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n",
+                     nocase ? "nocase" : "caseful", num_entries,
+                     hashTableLoad(tab));
+        if (hashTableLoad(tab) < MAX_HASH_TABLE_LOAD) {
+            break;
+        }
+        num_entries *= 2;
+    }
+    return tab;
+}
+
+static
+vector<u8> buildLits(const vector<ue2_case_string> &lits, u32 baseOffset,
+                     vector<u32> &litToOffsetVal) {
+    vector<u8> blob;
+    litToOffsetVal.resize(lits.size(), 0);
+
+    u32 lit_id = 0;
+    for (const auto &lit : lits) {
+        u32 offset = baseOffset + verify_u32(blob.size());
+        blob.insert(blob.end(), begin(lit.s), end(lit.s));
+        litToOffsetVal[lit_id] = offset;
+        lit_id++;
+    }
+
+    DEBUG_PRINTF("built %zu bytes of strings\n", blob.size());
+    return blob;
 }

 u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob,
@@ -251,89 +364,69 @@ u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob,

    LongLitInfo info = analyzeLongLits(lits, max_len);

-    // first assess the size and find our caseless threshold
-    size_t headerSize = ROUNDUP_16(sizeof(RoseLongLitTable));
+    vector<u32> litToOffsetVal;
+    const size_t headerSize = ROUNDUP_16(sizeof(RoseLongLitTable));
+    vector<u8> lit_blob = buildLits(lits, headerSize, litToOffsetVal);

-    size_t litTabOffset = headerSize;
+    // Build caseful bloom filter and hash table.
+    vector<u8> bloom_case;
+    vector<RoseLongLitHashEntry> tab_case;
+    if (info.caseful.num_literals) {
+        bloom_case = makeBloomFilter(lits, max_len, false);
+        tab_case = makeHashTable(lits, max_len, litToOffsetVal,
+                                 info.caseful.hashed_positions, false);
+    }

-    size_t litTabNumEntries = lits.size() + 1;
-    size_t litTabSize = ROUNDUP_16(litTabNumEntries * sizeof(RoseLongLiteral));
+    // Build nocase bloom filter and hash table.
+    vector<u8> bloom_nocase;
+    vector<RoseLongLitHashEntry> tab_nocase;
+    if (info.nocase.num_literals) {
+        bloom_nocase = makeBloomFilter(lits, max_len, true);
+        tab_nocase = makeHashTable(lits, max_len, litToOffsetVal,
+                                 info.nocase.hashed_positions, true);
+    }

-    size_t wholeLitTabOffset = litTabOffset + litTabSize;
-    size_t totalWholeLitTabSize =
-        ROUNDUP_16(info.caseful.positions + info.nocase.positions);
+    size_t wholeLitTabSize = ROUNDUP_16(byte_length(lit_blob));
+    size_t htOffsetCase = headerSize + wholeLitTabSize;
+    size_t htOffsetNocase = htOffsetCase + byte_length(tab_case);
+    size_t bloomOffsetCase = htOffsetNocase + byte_length(tab_nocase);
+    size_t bloomOffsetNocase = bloomOffsetCase + byte_length(bloom_case);

-    size_t htOffsetCase = wholeLitTabOffset + totalWholeLitTabSize;
-    size_t htSizeCase = info.caseful.hashEntries * sizeof(RoseLongLitHashEntry);
-    size_t htOffsetNocase = htOffsetCase + htSizeCase;
-    size_t htSizeNocase =
-        info.nocase.hashEntries * sizeof(RoseLongLitHashEntry);
-
-    size_t tabSize = ROUNDUP_16(htOffsetNocase + htSizeNocase);
+    size_t tabSize = ROUNDUP_16(bloomOffsetNocase + byte_length(bloom_nocase));

    // need to add +2 to both of these to allow space for the actual largest
    // value as well as handling the fact that we add one to the space when
    // storing out a position to allow zero to mean "no stream state value"
-    u8 streamBitsCase = lg2(roundUpToPowerOfTwo(info.caseful.positions + 2));
-    u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(info.nocase.positions + 2));
+    u8 streamBitsCase = lg2(roundUpToPowerOfTwo(tab_case.size() + 2));
+    u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(tab_nocase.size() + 2));
    u32 tot_state_bytes = ROUNDUP_N(streamBitsCase + streamBitsNocase, 8) / 8;

    auto table = aligned_zmalloc_unique<char>(tabSize);
    assert(table); // otherwise would have thrown std::bad_alloc

-    // then fill it in
-    char *ptr = table.get();
-    RoseLongLitTable *header = (RoseLongLitTable *)ptr;
-    // fill in header
+    // Fill in the RoseLongLitTable header structure.
+    RoseLongLitTable *header = (RoseLongLitTable *)(table.get());
+    header->size = verify_u32(tabSize);
    header->maxLen = verify_u8(max_len); // u8 so doesn't matter; won't go > 255
-    header->boundaryCase = info.caseful.boundary;
-    header->hashOffsetCase = verify_u32(htOffsetCase);
-    header->hashNBitsCase = lg2(info.caseful.hashEntries);
-    header->streamStateBitsCase = streamBitsCase;
-    header->boundaryNocase = info.nocase.boundary;
-    header->hashOffsetNocase = verify_u32(htOffsetNocase);
-    header->hashNBitsNocase = lg2(info.nocase.hashEntries);
-    header->streamStateBitsNocase = streamBitsNocase;
+    header->caseful.hashOffset = verify_u32(htOffsetCase);
+    header->caseful.hashBits = lg2(tab_case.size());
+    header->caseful.streamStateBits = streamBitsCase;
+    header->caseful.bloomOffset = verify_u32(bloomOffsetCase);
+    header->caseful.bloomBits = lg2(bloom_case.size() * 8);
+    header->nocase.hashOffset = verify_u32(htOffsetNocase);
+    header->nocase.hashBits = lg2(tab_nocase.size());
+    header->nocase.streamStateBits = streamBitsNocase;
+    header->nocase.bloomOffset = verify_u32(bloomOffsetNocase);
+    header->nocase.bloomBits = lg2(bloom_nocase.size() * 8);
    assert(tot_state_bytes < sizeof(u64a));
    header->streamStateBytes = verify_u8(tot_state_bytes); // u8

-    ptr += headerSize;
-
-    // now fill in the rest
-
-    RoseLongLiteral *litTabPtr = (RoseLongLiteral *)ptr;
-    ptr += litTabSize;
-
-    map<u32, u32> litToOffsetVal;
-    for (auto i = lits.begin(), e = lits.end(); i != e; ++i) {
-        u32 entry = verify_u32(i - lits.begin());
-        u32 offset = verify_u32(ptr - table.get());
-
-        // point the table entry to the string location
-        litTabPtr[entry].offset = offset;
-
-        litToOffsetVal[entry] = offset;
-
-        // copy the string into the string location
-        const auto &s = i->s;
-        memcpy(ptr, s.c_str(), s.size());
-
-        ptr += s.size(); // and the string location
-    }
-
-    // fill in final lit table entry with current ptr (serves as end value)
-    litTabPtr[lits.size()].offset = verify_u32(ptr - table.get());
-
-    // fill hash tables
-    ptr = table.get() + htOffsetCase;
-    fillHashes(lits, max_len, (RoseLongLitHashEntry *)ptr,
-               info.caseful.hashEntries, false, litToOffsetVal);
-    ptr += htSizeCase;
-    fillHashes(lits, max_len, (RoseLongLitHashEntry *)ptr,
-               info.nocase.hashEntries, true, litToOffsetVal);
-    ptr += htSizeNocase;
-
-    assert(ptr <= table.get() + tabSize);
+    // Copy in the literal strings, hash tables and bloom filters.
+    copy_bytes(table.get() + headerSize, lit_blob);
+    copy_bytes(table.get() + htOffsetCase, tab_case);
+    copy_bytes(table.get() + bloomOffsetCase, bloom_case);
+    copy_bytes(table.get() + htOffsetNocase, tab_nocase);
+    copy_bytes(table.get() + bloomOffsetNocase, bloom_nocase);

    DEBUG_PRINTF("built streaming table, size=%zu\n", tabSize);
    DEBUG_PRINTF("requires %zu bytes of history\n", max_len);