mirror of https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
rose: simplify long lit table, add bloom filter
Replaces the original long lit hash table (used in streaming mode) with a smaller, simpler linear probing approach. Adds a bloom filter in front of it to reduce time spent on false positives. Sizing of both the hash table and the bloom filter is done based on max load.
This commit is contained in:
parent 68bf473e2e
commit 8869dee643
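The sizing strategy the commit message describes can be illustrated with a small, self-contained sketch. This is a toy illustration only, not vectorscan code: the names (LitTable, hashN, buildTable, lookup) and the load constants are invented here, and the real tables hash string positions rather than whole strings. It shows the two ideas together: a bloom filter consulted before the hash table, and both structures rebuilt at double the size until their measured load drops under a cap.

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

struct LitTable {
    std::vector<uint8_t> bloom;      // bloom filter, one bit per key
    std::vector<std::string> slots;  // open-addressed table; "" == vacant
};

// Stand-in for the real hashLongLiteral/bloomHash_* functions.
static uint32_t hashN(const std::string &s, uint32_t salt) {
    return static_cast<uint32_t>(std::hash<std::string>{}(s + char('A' + salt)));
}

static void bloomSet(std::vector<uint8_t> &bloom, uint32_t key) {
    bloom[(key / 8) % bloom.size()] |= 1u << (key % 8);
}

static bool bloomTest(const std::vector<uint8_t> &bloom, uint32_t key) {
    return bloom[(key / 8) % bloom.size()] & (1u << (key % 8));
}

static LitTable buildTable(const std::vector<std::string> &lits) {
    const double kMaxHashLoad = 0.7;   // cap on occupied hash slots
    const double kMaxBloomLoad = 0.25; // cap on set bloom bits
    LitTable t;

    // Hash table: start with enough entries for the keys, rebuild at double
    // the size until the load factor falls below the cap.
    size_t entries = 128;
    while (entries < lits.size()) entries *= 2;
    for (;;) {
        t.slots.assign(entries, std::string());
        for (const auto &lit : lits) {
            size_t bucket = hashN(lit, 0) % entries;
            while (!t.slots[bucket].empty()) {      // linear probing
                bucket = (bucket + 1) % entries;
            }
            t.slots[bucket] = lit;
        }
        if (static_cast<double>(lits.size()) / entries < kMaxHashLoad) break;
        entries *= 2;
    }

    // Bloom filter: same doubling strategy, three hashes per key.
    size_t bits = 256;
    for (;;) {
        t.bloom.assign(bits / 8, 0);
        for (const auto &lit : lits) {
            for (uint32_t salt = 1; salt <= 3; salt++) {
                bloomSet(t.bloom, hashN(lit, salt) % bits);
            }
        }
        size_t set_bits = 0;
        for (uint8_t b : t.bloom) {
            while (b) { set_bits += b & 1u; b >>= 1; }
        }
        if (static_cast<double>(set_bits) / bits < kMaxBloomLoad) break;
        bits *= 2;
    }
    return t;
}

static bool lookup(const LitTable &t, const std::string &s) {
    const size_t bits = t.bloom.size() * 8;
    for (uint32_t salt = 1; salt <= 3; salt++) {
        if (!bloomTest(t.bloom, hashN(s, salt) % bits)) {
            return false; // bloom filter rejects most misses cheaply
        }
    }
    size_t bucket = hashN(s, 0) % t.slots.size();
    while (!t.slots[bucket].empty()) {               // probe until a hole
        if (t.slots[bucket] == s) return true;
        bucket = (bucket + 1) % t.slots.size();
    }
    return false;
}
```

In this sketch, buildTable plays the role of the compile-time makeHashTable/makeBloomFilter pair introduced below, and lookup mirrors the runtime checkBloomFilter followed by checkHashTable sequence that this change adds.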
@@ -4351,6 +4351,7 @@ void makeCheckLiteralInstruction(const RoseBuildImpl &build,
if (lit.table != ROSE_FLOATING) {
return;
}
assert(bc.longLitLengthThreshold > 0);
if (lit.s.length() <= bc.longLitLengthThreshold) {
return;
}
@@ -4937,6 +4938,8 @@ void allocateFinalIdToSet(RoseBuildImpl &build, const set<u32> &lits,
* ids and squash the same roles and have the same group squashing
* behaviour. Benefits literals cannot be merged. */

assert(longLitLengthThreshold > 0);

for (u32 int_id : lits) {
rose_literal_info &curr_info = literal_info[int_id];
const rose_literal_id &lit = build.literals.right.at(int_id);
@@ -36,17 +36,28 @@
#include "util/verify_types.h"
#include "util/compile_context.h"

#include <algorithm>
#include <numeric>

using namespace std;

namespace ue2 {

/** \brief Minimum size for a non-empty hash table. */
static constexpr u32 MIN_HASH_TABLE_SIZE = 4096;
/** \brief Minimum size for a non-empty hash table. Must be a power of two. */
static constexpr u32 MIN_HASH_TABLE_SIZE = 128;

/** \brief Maximum load factor (between zero and one) for a hash table. */
static constexpr double MAX_HASH_TABLE_LOAD = 0.7;

/** \brief Minimum size (in bits) for a bloom filter. Must be a power of two. */
static constexpr u32 MIN_BLOOM_FILTER_SIZE = 256;

/** \brief Maximum load factor (between zero and one) for a bloom filter. */
static constexpr double MAX_BLOOM_FILTER_LOAD = 0.25;

struct LongLitModeInfo {
u32 boundary = 0; //!< One above the largest index for this mode.
u32 positions = 0; //!< Total number of string positions.
u32 hashEntries = 0; //!< Number of hash table entries.
u32 num_literals = 0; //!< Number of strings for this mode.
u32 hashed_positions = 0; //!< Number of hashable string positions.
};

struct LongLitInfo {
@@ -66,54 +77,120 @@ static
LongLitInfo analyzeLongLits(const vector<ue2_case_string> &lits,
size_t max_len) {
LongLitInfo info;
u32 hashedPositionsCase = 0;
u32 hashedPositionsNocase = 0;

// Caseful boundary is the index of the first nocase literal, as we're
// ordered (caseful, nocase).
auto first_nocase = find_if(begin(lits), end(lits),
[](const ue2_case_string &lit) { return lit.nocase; });
info.caseful.boundary = verify_u32(distance(lits.begin(), first_nocase));

// Nocase boundary is the size of the literal set.
info.nocase.boundary = verify_u32(lits.size());

for (const auto &lit : lits) {
if (lit.nocase) {
hashedPositionsNocase += lit.s.size() - max_len;
info.nocase.positions += lit.s.size();
} else {
hashedPositionsCase += lit.s.size() - max_len;
info.caseful.positions += lit.s.size();
}
auto &lit_info = lit.nocase ? info.nocase : info.caseful;
assert(lit.s.size() > max_len);
lit_info.num_literals++;
lit_info.hashed_positions += lit.s.size() - max_len;
}

info.caseful.hashEntries = hashedPositionsCase
? roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, hashedPositionsCase))
: 0;
info.nocase.hashEntries = hashedPositionsNocase
? roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, hashedPositionsNocase))
: 0;

DEBUG_PRINTF("caseful: boundary=%u, positions=%u, hashedPositions=%u, "
"hashEntries=%u\n",
info.caseful.boundary, info.caseful.positions,
hashedPositionsCase, info.caseful.hashEntries);
DEBUG_PRINTF("nocase: boundary=%u, positions=%u, hashedPositions=%u, "
"hashEntries=%u\n",
info.nocase.boundary, info.nocase.positions,
hashedPositionsNocase, info.nocase.hashEntries);
DEBUG_PRINTF("case: hashed %u positions\n", info.caseful.hashed_positions);
DEBUG_PRINTF("nocase: hashed %u positions\n", info.nocase.hashed_positions);

return info;
}

static
void fillHashes(const vector<ue2_case_string> &lits, size_t max_len,
RoseLongLitHashEntry *tab, size_t numEntries, bool nocase,
const map<u32, u32> &litToOffsetVal) {
const u32 nbits = lg2(numEntries);
map<u32, deque<pair<u32, u32>>> bucketToLitOffPairs;
map<u32, u64a> bucketToBitfield;
void addToBloomFilter(vector<u8> &bloom, const u8 *substr, bool nocase) {
const u32 num_keys = verify_u32(bloom.size() * 8);
const u32 key_mask = (1U << lg2(num_keys)) -1;

const auto hash_functions = { bloomHash_1, bloomHash_2, bloomHash_3 };
for (const auto &hash_func : hash_functions) {
u32 hash = hash_func(substr, nocase);
u32 key = hash & key_mask;
DEBUG_PRINTF("set key %u (of %zu)\n", key, bloom.size() * 8);
bloom[key / 8] |= 1U << (key % 8);
}
}

static
size_t bloomOccupancy(const vector<u8> &bloom) {
return accumulate(begin(bloom), end(bloom), 0,
[](const size_t &sum, const u8 &elem) {
return sum + popcount32(elem);
});
}

static
double bloomLoad(const vector<u8> &bloom) {
return (double)bloomOccupancy(bloom) / (double)(bloom.size() * 8);
}

static
vector<u8> buildBloomFilter(const vector<ue2_case_string> &lits, size_t max_len,
size_t num_entries, bool nocase) {
assert(num_entries % 8 == 0);
assert((num_entries & (num_entries - 1)) == 0); // Must be power of two.

vector<u8> bloom(num_entries / 8, 0);

if (!num_entries) {
return bloom;
}

for (const auto &lit : lits) {
if (nocase != lit.nocase) {
continue;
}
for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) {
const u8 *substr = (const u8 *)lit.s.c_str() + offset;
addToBloomFilter(bloom, substr, nocase);
}
}

DEBUG_PRINTF("%s bloom filter occupancy %zu of %zu entries\n",
nocase ? "nocase" : "caseful", bloomOccupancy(bloom),
num_entries);

return bloom;
}

static
vector<u8> makeBloomFilter(const vector<ue2_case_string> &lits,
size_t max_len, bool nocase) {
vector<u8> bloom;

size_t num_entries = MIN_BLOOM_FILTER_SIZE;
for (;;) {
bloom = buildBloomFilter(lits, max_len, num_entries, nocase);
DEBUG_PRINTF("built %s bloom for %zu entries: load %f\n",
nocase ? "nocase" : "caseful", num_entries,
bloomLoad(bloom));
if (bloomLoad(bloom) < MAX_BLOOM_FILTER_LOAD) {
break;
}
num_entries *= 2;
}
return bloom;
}

static
size_t hashTableOccupancy(const vector<RoseLongLitHashEntry> &tab) {
return count_if(begin(tab), end(tab), [](const RoseLongLitHashEntry &ent) {
return ent.str_offset != 0;
});
}

static
double hashTableLoad(const vector<RoseLongLitHashEntry> &tab) {
return (double)hashTableOccupancy(tab) / (double)(tab.size());
}

static
vector<RoseLongLitHashEntry> buildHashTable(const vector<ue2_case_string> &lits,
size_t max_len,
const vector<u32> &litToOffsetVal,
size_t numEntries, bool nocase) {
vector<RoseLongLitHashEntry> tab(numEntries, {0,0});

if (!numEntries) {
return tab;
}

map<u32, vector<pair<u32, u32>>> hashToLitOffPairs;

for (u32 lit_id = 0; lit_id < lits.size(); lit_id++) {
const ue2_case_string &lit = lits[lit_id];
@@ -122,37 +199,41 @@ void fillHashes(const vector<ue2_case_string> &lits, size_t max_len,
}
for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) {
const u8 *substr = (const u8 *)lit.s.c_str() + offset;
u32 h = hashLongLiteral(substr, max_len, lit.nocase);
u32 h_ent = h & ((1U << nbits) - 1);
u32 h_low = (h >> nbits) & 63;
bucketToLitOffPairs[h_ent].emplace_back(lit_id, offset);
bucketToBitfield[h_ent] |= (1ULL << h_low);
u32 hash = hashLongLiteral(substr, max_len, lit.nocase);
hashToLitOffPairs[hash].emplace_back(lit_id, offset);
}
}

// this used to be a set<u32>, but a bitset is much much faster given that
// we're using it only for membership testing.
boost::dynamic_bitset<> filledBuckets(numEntries); // all zero by default.
for (auto &m : hashToLitOffPairs) {
u32 hash = m.first;
vector<pair<u32, u32>> &d = m.second;

// sweep out bitfield entries and save the results swapped accordingly
// also, anything with bitfield entries is put in filledBuckets
for (const auto &m : bucketToBitfield) {
const u32 &bucket = m.first;
const u64a &contents = m.second;
tab[bucket].bitfield = contents;
filledBuckets.set(bucket);
}
// Sort by (offset, string) so that we'll be able to remove identical
// string prefixes.
stable_sort(begin(d), end(d),
[&](const pair<u32, u32> &a, const pair<u32, u32> &b) {
const auto &str_a = lits[a.first].s;
const auto &str_b = lits[b.first].s;
return tie(a.second, str_a) < tie(b.second, str_b);
});

// store out all our chains based on free values in our hash table.
// find nearest free locations that are empty (there will always be more
// entries than strings, at present)
for (auto &m : bucketToLitOffPairs) {
u32 bucket = m.first;
deque<pair<u32, u32>> &d = m.second;
// Remove entries that point to the same literal prefix.
d.erase(unique(begin(d), end(d),
[&](const pair<u32, u32> &a, const pair<u32, u32> &b) {
if (a.second != b.second) {
return false;
}
const auto &str_a = lits[a.first].s;
const auto &str_b = lits[b.first].s;
const size_t len = max_len + a.second;
return equal(begin(str_a), begin(str_a) + len,
begin(str_b));
}),
end(d));

// sort d by distance of the residual string (len minus our depth into
// the string). We need to put the 'furthest back' string first...
stable_sort(d.begin(), d.end(),
// Sort d by distance of the residual string (len minus our depth into
// the string). We need to put the 'furthest back' string first.
stable_sort(begin(d), end(d),
[](const pair<u32, u32> &a, const pair<u32, u32> &b) {
if (a.second != b.second) {
return a.second > b.second; /* longest is first */
@@ -160,47 +241,79 @@ void fillHashes(const vector<ue2_case_string> &lits, size_t max_len,
return a.first < b.first;
});

while (1) {
// first time through is always at bucket, then we fill in links
filledBuckets.set(bucket);
RoseLongLitHashEntry *ent = &tab[bucket];
u32 lit_id = d.front().first;
u32 offset = d.front().second;
u32 bucket = hash % numEntries;

ent->state = verify_u32(litToOffsetVal.at(lit_id) +
offset + max_len);
ent->link = (u32)LINK_INVALID;

d.pop_front();
if (d.empty()) {
break;
}
// now, if there is another value
// find a bucket for it and put in 'bucket' and repeat
// all we really need to do is find something not in filledBuckets,
// ideally something close to bucket
// we search backward and forward from bucket, trying to stay as
// close as possible.
UNUSED bool found = false;
int bucket_candidate = 0;
for (u32 k = 1; k < numEntries * 2; k++) {
bucket_candidate = bucket + (((k & 1) == 0)
? (-(int)k / 2) : (k / 2));
if (bucket_candidate < 0 ||
(size_t)bucket_candidate >= numEntries) {
continue;
}
if (!filledBuckets.test(bucket_candidate)) {
found = true;
break;
// Placement via linear probing.
for (const auto &lit_offset : d) {
while (tab[bucket].str_offset != 0) {
bucket++;
if (bucket == numEntries) {
bucket = 0;
}
}

assert(found);
bucket = bucket_candidate;
ent->link = bucket;
u32 lit_id = lit_offset.first;
u32 offset = lit_offset.second;

DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash,
lit_id, offset, bucket);

auto &entry = tab[bucket];
entry.str_offset = verify_u32(litToOffsetVal.at(lit_id));
assert(entry.str_offset != 0);
entry.str_len = offset + max_len;
}
}

DEBUG_PRINTF("%s hash table occupancy %zu of %zu entries\n",
nocase ? "nocase" : "caseful", hashTableOccupancy(tab),
numEntries);

return tab;
}

static
vector<RoseLongLitHashEntry> makeHashTable(const vector<ue2_case_string> &lits,
size_t max_len,
const vector<u32> &litToOffsetVal,
u32 numPositions, bool nocase) {
vector<RoseLongLitHashEntry> tab;

// Note: for the hash table, we must always have at least enough entries
// for the number of hashable positions.
size_t num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE,
numPositions));

for (;;) {
tab = buildHashTable(lits, max_len, litToOffsetVal, num_entries,
nocase);
DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n",
nocase ? "nocase" : "caseful", num_entries,
hashTableLoad(tab));
if (hashTableLoad(tab) < MAX_HASH_TABLE_LOAD) {
break;
}
num_entries *= 2;
}
return tab;
}

static
vector<u8> buildLits(const vector<ue2_case_string> &lits, u32 baseOffset,
vector<u32> &litToOffsetVal) {
vector<u8> blob;
litToOffsetVal.resize(lits.size(), 0);

u32 lit_id = 0;
for (const auto &lit : lits) {
u32 offset = baseOffset + verify_u32(blob.size());
blob.insert(blob.end(), begin(lit.s), end(lit.s));
litToOffsetVal[lit_id] = offset;
lit_id++;
}

DEBUG_PRINTF("built %zu bytes of strings\n", blob.size());
return blob;
}

u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob,
@@ -251,89 +364,69 @@ u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob,

LongLitInfo info = analyzeLongLits(lits, max_len);

// first assess the size and find our caseless threshold
size_t headerSize = ROUNDUP_16(sizeof(RoseLongLitTable));
vector<u32> litToOffsetVal;
const size_t headerSize = ROUNDUP_16(sizeof(RoseLongLitTable));
vector<u8> lit_blob = buildLits(lits, headerSize, litToOffsetVal);

size_t litTabOffset = headerSize;
// Build caseful bloom filter and hash table.
vector<u8> bloom_case;
vector<RoseLongLitHashEntry> tab_case;
if (info.caseful.num_literals) {
bloom_case = makeBloomFilter(lits, max_len, false);
tab_case = makeHashTable(lits, max_len, litToOffsetVal,
info.caseful.hashed_positions, false);
}

size_t litTabNumEntries = lits.size() + 1;
size_t litTabSize = ROUNDUP_16(litTabNumEntries * sizeof(RoseLongLiteral));
// Build nocase bloom filter and hash table.
vector<u8> bloom_nocase;
vector<RoseLongLitHashEntry> tab_nocase;
if (info.nocase.num_literals) {
bloom_nocase = makeBloomFilter(lits, max_len, true);
tab_nocase = makeHashTable(lits, max_len, litToOffsetVal,
info.nocase.hashed_positions, true);
}

size_t wholeLitTabOffset = litTabOffset + litTabSize;
size_t totalWholeLitTabSize =
ROUNDUP_16(info.caseful.positions + info.nocase.positions);
size_t wholeLitTabSize = ROUNDUP_16(byte_length(lit_blob));
size_t htOffsetCase = headerSize + wholeLitTabSize;
size_t htOffsetNocase = htOffsetCase + byte_length(tab_case);
size_t bloomOffsetCase = htOffsetNocase + byte_length(tab_nocase);
size_t bloomOffsetNocase = bloomOffsetCase + byte_length(bloom_case);

size_t htOffsetCase = wholeLitTabOffset + totalWholeLitTabSize;
size_t htSizeCase = info.caseful.hashEntries * sizeof(RoseLongLitHashEntry);
size_t htOffsetNocase = htOffsetCase + htSizeCase;
size_t htSizeNocase =
info.nocase.hashEntries * sizeof(RoseLongLitHashEntry);

size_t tabSize = ROUNDUP_16(htOffsetNocase + htSizeNocase);
size_t tabSize = ROUNDUP_16(bloomOffsetNocase + byte_length(bloom_nocase));

// need to add +2 to both of these to allow space for the actual largest
// value as well as handling the fact that we add one to the space when
// storing out a position to allow zero to mean "no stream state value"
u8 streamBitsCase = lg2(roundUpToPowerOfTwo(info.caseful.positions + 2));
u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(info.nocase.positions + 2));
u8 streamBitsCase = lg2(roundUpToPowerOfTwo(tab_case.size() + 2));
u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(tab_nocase.size() + 2));
u32 tot_state_bytes = ROUNDUP_N(streamBitsCase + streamBitsNocase, 8) / 8;

auto table = aligned_zmalloc_unique<char>(tabSize);
assert(table); // otherwise would have thrown std::bad_alloc

// then fill it in
char *ptr = table.get();
RoseLongLitTable *header = (RoseLongLitTable *)ptr;
// fill in header
// Fill in the RoseLongLitTable header structure.
RoseLongLitTable *header = (RoseLongLitTable *)(table.get());
header->size = verify_u32(tabSize);
header->maxLen = verify_u8(max_len); // u8 so doesn't matter; won't go > 255
header->boundaryCase = info.caseful.boundary;
header->hashOffsetCase = verify_u32(htOffsetCase);
header->hashNBitsCase = lg2(info.caseful.hashEntries);
header->streamStateBitsCase = streamBitsCase;
header->boundaryNocase = info.nocase.boundary;
header->hashOffsetNocase = verify_u32(htOffsetNocase);
header->hashNBitsNocase = lg2(info.nocase.hashEntries);
header->streamStateBitsNocase = streamBitsNocase;
header->caseful.hashOffset = verify_u32(htOffsetCase);
header->caseful.hashBits = lg2(tab_case.size());
header->caseful.streamStateBits = streamBitsCase;
header->caseful.bloomOffset = verify_u32(bloomOffsetCase);
header->caseful.bloomBits = lg2(bloom_case.size() * 8);
header->nocase.hashOffset = verify_u32(htOffsetNocase);
header->nocase.hashBits = lg2(tab_nocase.size());
header->nocase.streamStateBits = streamBitsNocase;
header->nocase.bloomOffset = verify_u32(bloomOffsetNocase);
header->nocase.bloomBits = lg2(bloom_nocase.size() * 8);
assert(tot_state_bytes < sizeof(u64a));
header->streamStateBytes = verify_u8(tot_state_bytes); // u8

ptr += headerSize;

// now fill in the rest

RoseLongLiteral *litTabPtr = (RoseLongLiteral *)ptr;
ptr += litTabSize;

map<u32, u32> litToOffsetVal;
for (auto i = lits.begin(), e = lits.end(); i != e; ++i) {
u32 entry = verify_u32(i - lits.begin());
u32 offset = verify_u32(ptr - table.get());

// point the table entry to the string location
litTabPtr[entry].offset = offset;

litToOffsetVal[entry] = offset;

// copy the string into the string location
const auto &s = i->s;
memcpy(ptr, s.c_str(), s.size());

ptr += s.size(); // and the string location
}

// fill in final lit table entry with current ptr (serves as end value)
litTabPtr[lits.size()].offset = verify_u32(ptr - table.get());

// fill hash tables
ptr = table.get() + htOffsetCase;
fillHashes(lits, max_len, (RoseLongLitHashEntry *)ptr,
info.caseful.hashEntries, false, litToOffsetVal);
ptr += htSizeCase;
fillHashes(lits, max_len, (RoseLongLitHashEntry *)ptr,
info.nocase.hashEntries, true, litToOffsetVal);
ptr += htSizeNocase;

assert(ptr <= table.get() + tabSize);
// Copy in the literal strings, hash tables and bloom filters,
copy_bytes(table.get() + headerSize, lit_blob);
copy_bytes(table.get() + htOffsetCase, tab_case);
copy_bytes(table.get() + bloomOffsetCase, bloom_case);
copy_bytes(table.get() + htOffsetNocase, tab_nocase);
copy_bytes(table.get() + bloomOffsetNocase, bloom_nocase);

DEBUG_PRINTF("built streaming table, size=%zu\n", tabSize);
DEBUG_PRINTF("requires %zu bytes of history\n", max_len);
@@ -49,9 +49,10 @@
#include <fstream>
#include <iomanip>
#include <map>
#include <numeric>
#include <ostream>
#include <string>
#include <sstream>
#include <string>
#include <utility>

#ifndef DUMP_SUPPORT
@@ -1049,6 +1050,39 @@ void dumpAnchoredStats(const void *atable, FILE *f) {

}

static
void dumpLongLiteralSubtable(const RoseLongLitTable *ll_table,
const RoseLongLitSubtable *ll_sub, FILE *f) {
if (!ll_sub->hashBits) {
fprintf(f, " <no table>\n");
return;
}

const char *base = (const char *)ll_table;

u32 nbits = ll_sub->hashBits;
u32 num_entries = 1U << nbits;
const auto *tab = (const RoseLongLitHashEntry *)(base + ll_sub->hashOffset);
u32 hash_occ =
count_if(tab, tab + num_entries, [](const RoseLongLitHashEntry &ent) {
return ent.str_offset != 0;
});
float hash_occ_percent = ((float)hash_occ / (float)num_entries) * 100;

fprintf(f, " hash table : %u bits, occupancy %u/%u (%0.1f%%)\n",
nbits, hash_occ, num_entries, hash_occ_percent);

u32 bloom_bits = ll_sub->bloomBits;
u32 bloom_size = 1U << bloom_bits;
const u8 *bloom = (const u8 *)base + ll_sub->bloomOffset;
u32 bloom_occ = accumulate(bloom, bloom + bloom_size / 8, 0,
[](const u32 &sum, const u8 &elem) { return sum + popcount32(elem); });
float bloom_occ_percent = ((float)bloom_occ / (float)(bloom_size)) * 100;

fprintf(f, " bloom filter : %u bits, occupancy %u/%u (%0.1f%%)\n",
bloom_bits, bloom_occ, bloom_size, bloom_occ_percent);
}

static
void dumpLongLiteralTable(const RoseEngine *t, FILE *f) {
if (!t->longLitTableOffset) {
@@ -1062,17 +1096,15 @@ void dumpLongLiteralTable(const RoseEngine *t, FILE *f) {
(const struct RoseLongLitTable *)loadFromByteCodeOffset(
t, t->longLitTableOffset);

u32 num_caseful = ll_table->boundaryCase;
u32 num_caseless = ll_table->boundaryNocase - num_caseful;
fprintf(f, " total size : %u bytes\n", ll_table->size);
fprintf(f, " longest len : %u\n", ll_table->maxLen);
fprintf(f, " stream state : %u bytes\n", ll_table->streamStateBytes);

fprintf(f, " longest len: %u\n", ll_table->maxLen);
fprintf(f, " counts: %u caseful, %u caseless\n", num_caseful,
num_caseless);
fprintf(f, " hash bits: %u caseful, %u caseless\n",
ll_table->hashNBitsCase, ll_table->hashNBitsNocase);
fprintf(f, " state bits: %u caseful, %u caseless\n",
ll_table->streamStateBitsCase, ll_table->streamStateBitsNocase);
fprintf(f, " stream state: %u bytes\n", ll_table->streamStateBytes);
fprintf(f, " caseful:\n");
dumpLongLiteralSubtable(ll_table, &ll_table->caseful, f);

fprintf(f, " nocase:\n");
dumpLongLiteralSubtable(ll_table, &ll_table->nocase, f);
}

// Externally accessible functions
@@ -446,51 +446,49 @@ struct ALIGN_CL_DIRECTIVE anchored_matcher_info {
u32 anchoredMinDistance; /* start of region to run anchored table over */
};

/**
* \brief Long literal subtable for a particular mode (caseful or nocase).
*/
struct RoseLongLitSubtable {
/**
* \brief Offset of the hash table (relative to RoseLongLitTable base).
*
* Offset is zero if no such table exists.
*/
u32 hashOffset;

/**
* \brief Offset of the bloom filter (relative to RoseLongLitTable base).
*
* Offset is zero if no such table exists.
*/
u32 bloomOffset;

/** \brief lg2 of the size of the hash table. */
u8 hashBits;

/** \brief Size of the bloom filter in bits. */
u8 bloomBits;

/** \brief Number of bits of packed stream state used. */
u8 streamStateBits;
};

/**
* \brief Long literal table header.
*/
struct RoseLongLitTable {
/** \brief String ID one beyond the maximum entry for caseful literals. */
u32 boundaryCase;

/**
* \brief String ID one beyond the maximum entry for caseless literals.
* This is also the total size of the literal table.
* \brief Total size of the whole table (including strings, bloom filters,
* hash tables).
*/
u32 boundaryNocase;
u32 size;

/**
* \brief Offset of the caseful hash table (relative to RoseLongLitTable
* base).
*
* Offset is zero if no such table exists.
*/
u32 hashOffsetCase;
/** \brief Caseful sub-table (hash table and bloom filter). */
struct RoseLongLitSubtable caseful;

/**
* \brief Offset of the caseless hash table (relative to RoseLongLitTable
* base).
*
* Offset is zero if no such table exists.
*/
u32 hashOffsetNocase;

/** \brief lg2 of the size of the caseful hash table. */
u32 hashNBitsCase;

/** \brief lg2 of the size of the caseless hash table. */
u32 hashNBitsNocase;

/**
* \brief Number of bits of packed stream state for the caseful hash table.
*/
u8 streamStateBitsCase;

/**
* \brief Number of bits of packed stream state for the caseless hash
* table.
*/
u8 streamStateBitsNocase;
/** \brief Caseless sub-table (hash table and bloom filter). */
struct RoseLongLitSubtable nocase;

/** \brief Total size of packed stream state in bytes. */
u8 streamStateBytes;
@@ -499,39 +497,19 @@ struct RoseLongLitTable {
u8 maxLen;
};

/**
* \brief One of these structures per literal entry in our long literal table.
*/
struct RoseLongLiteral {
/**
* \brief Offset of the literal string itself, relative to
* RoseLongLitTable base.
*/
u32 offset;
};

/** \brief "No further links" value used for \ref RoseLongLitHashEntry::link. */
#define LINK_INVALID 0xffffffff

/**
* \brief One of these structures per hash table entry in our long literal
* table.
*/
struct RoseLongLitHashEntry {
/**
* \brief Bitfield used as a quick guard for hash buckets.
*
* For a given hash value N, the low six bits of N are taken and the
* corresponding bit is switched on in this bitfield if this bucket is used
* for that hash.
* \brief Offset of the literal string itself, relative to
* RoseLongLitTable base. Zero if this bucket is empty.
*/
u64a bitfield;
u32 str_offset;

/** \brief Offset in the literal table for this string. */
u32 state;

/** \brief Hash table index of next entry in the chain for this bucket. */
u32 link;
/** \brief Length of the literal string. */
u32 str_len;
};

static really_inline
@@ -551,6 +551,11 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
tctxt->minMatchOffset = offset;
tctxt->minNonMpvMatchOffset = offset;
tctxt->next_mpv_offset = 0;
tctxt->ll_buf = scratch->core_info.hbuf;
tctxt->ll_len = scratch->core_info.hlen;
tctxt->ll_buf_nocase = scratch->core_info.hbuf;
tctxt->ll_len_nocase = scratch->core_info.hlen;

DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu groups=%016llx\n",
scratch->core_info.hlen, scratch->core_info.len, tctxt->groups);
@@ -590,18 +595,14 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
MIN(t->floatingDistance, length + offset) - offset : 0;
}

loadLongLiteralState(t, state, scratch);

size_t hlength = scratch->core_info.hlen;
char rebuild = 0;

if (hlength) {
// Can only have long literal state or rebuild if this is not the
// first write to this stream.
loadLongLiteralState(t, state, scratch);
rebuild = (scratch->core_info.status & STATUS_DELAY_DIRTY) &&
(t->maxFloatingDelayedMatch == ROSE_BOUND_INF ||
offset < t->maxFloatingDelayedMatch);
}

char rebuild = hlength &&
(scratch->core_info.status & STATUS_DELAY_DIRTY) &&
(t->maxFloatingDelayedMatch == ROSE_BOUND_INF ||
offset < t->maxFloatingDelayedMatch);
DEBUG_PRINTF("**rebuild %hhd status %hhu mfdm %u, offset %llu\n",
rebuild, scratch->core_info.status,
t->maxFloatingDelayedMatch, offset);
@@ -36,52 +36,12 @@
#include "util/copybytes.h"

static really_inline
const struct RoseLongLiteral *
getLitTab(const struct RoseLongLitTable *ll_table) {
return (const struct RoseLongLiteral *)((const char *)ll_table +
ROUNDUP_16(sizeof(struct RoseLongLitTable)));
}

static really_inline
u32 get_start_lit_idx(const struct RoseLongLitTable *ll_table,
const char nocase) {
return nocase ? ll_table->boundaryCase : 0;
}

static really_inline
u32 get_end_lit_idx(const struct RoseLongLitTable *ll_table,
const char nocase) {
return nocase ? ll_table->boundaryNocase : ll_table->boundaryCase;
}

// search for the literal index that contains the current state
static rose_inline
u32 findLitTabEntry(const struct RoseLongLitTable *ll_table,
u32 stateValue, const char nocase) {
const struct RoseLongLiteral *litTab = getLitTab(ll_table);
u32 lo = get_start_lit_idx(ll_table, nocase);
u32 hi = get_end_lit_idx(ll_table, nocase);

// Now move stateValue back by one so that we're looking for the
// litTab entry that includes it the string, not the one 'one past' it
stateValue -= 1;
assert(lo != hi);
assert(litTab[lo].offset <= stateValue);
assert(litTab[hi].offset > stateValue);

// binary search to find the entry e such that:
// litTab[e].offsetToLiteral <= stateValue < litTab[e+1].offsetToLiteral
while (lo + 1 < hi) {
u32 mid = (lo + hi) / 2;
if (litTab[mid].offset <= stateValue) {
lo = mid;
} else { // (litTab[mid].offset > stateValue) {
hi = mid;
}
}
assert(litTab[lo].offset <= stateValue);
assert(litTab[hi].offset > stateValue);
return lo;
const struct RoseLongLitHashEntry *
getHashTableBase(const struct RoseLongLitTable *ll_table,
const struct RoseLongLitSubtable *ll_sub) {
assert(ll_sub->hashOffset);
return (const struct RoseLongLitHashEntry *)((const char *)ll_table +
ll_sub->hashOffset);
}

// Reads from stream state and unpacks values into stream state table.
@@ -94,8 +54,8 @@ void loadLongLitStreamState(const struct RoseLongLitTable *ll_table,
assert(state_case && state_nocase);

u8 ss_bytes = ll_table->streamStateBytes;
u8 ssb = ll_table->streamStateBitsCase;
UNUSED u8 ssb_nc = ll_table->streamStateBitsNocase;
u8 ssb = ll_table->caseful.streamStateBits;
UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits;
assert(ss_bytes == (ssb + ssb_nc + 7) / 8);

#if defined(ARCH_32_BIT)
@@ -116,40 +76,22 @@ void loadLongLitStreamState(const struct RoseLongLitTable *ll_table,
*state_nocase = (u32)(streamVal >> ssb);
}

static really_inline
u32 getBaseOffsetOfLits(const struct RoseLongLitTable *ll_table,
const char nocase) {
u32 lit_idx = get_start_lit_idx(ll_table, nocase);
return getLitTab(ll_table)[lit_idx].offset;
}

static really_inline
u32 unpackStateVal(const struct RoseLongLitTable *ll_table, const char nocase,
u32 v) {
return v + getBaseOffsetOfLits(ll_table, nocase) - 1;
}

static really_inline
u32 packStateVal(const struct RoseLongLitTable *ll_table, const char nocase,
u32 v) {
return v - getBaseOffsetOfLits(ll_table, nocase) + 1;
}

static rose_inline
void loadLongLiteralStateMode(struct hs_scratch *scratch,
const struct RoseLongLitTable *ll_table,
const struct RoseLongLiteral *litTab,
const struct RoseLongLitSubtable *ll_sub,
const u32 state, const char nocase) {
if (!state) {
DEBUG_PRINTF("no state for %s\n", nocase ? "caseless" : "caseful");
return;
}

u32 stateValue = unpackStateVal(ll_table, nocase, state);
u32 idx = findLitTabEntry(ll_table, stateValue, nocase);
size_t found_offset = litTab[idx].offset;
const u8 *found_buf = found_offset + (const u8 *)ll_table;
size_t found_sz = stateValue - found_offset;
const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub);
const struct RoseLongLitHashEntry *ent = tab + state - 1;

assert(ent->str_offset + ent->str_len <= ll_table->size);
const u8 *found_buf = (const u8 *)ll_table + ent->str_offset;
size_t found_sz = ent->str_len;

struct RoseContext *tctxt = &scratch->tctxt;
if (nocase) {
@@ -168,34 +110,42 @@ void loadLongLiteralState(const struct RoseEngine *t, char *state,
return;
}

// If we don't have any long literals in play, these values must point to
// the real history buffer so that CHECK_LITERAL instructions examine the
// history buffer.
scratch->tctxt.ll_buf = scratch->core_info.hbuf;
scratch->tctxt.ll_len = scratch->core_info.hlen;
scratch->tctxt.ll_buf_nocase = scratch->core_info.hbuf;
scratch->tctxt.ll_len_nocase = scratch->core_info.hlen;

if (!scratch->core_info.hlen) {
return;
}

const struct RoseLongLitTable *ll_table =
getByOffset(t, t->longLitTableOffset);
const struct RoseLongLiteral *litTab = getLitTab(ll_table);
const u8 *ll_state = getLongLitState(t, state);

u32 state_case;
u32 state_nocase;
loadLongLitStreamState(ll_table, ll_state, &state_case, &state_nocase);

loadLongLiteralStateMode(scratch, ll_table, litTab, state_case, 0);
loadLongLiteralStateMode(scratch, ll_table, litTab, state_nocase, 1);
DEBUG_PRINTF("loaded {%u, %u}\n", state_case, state_nocase);

loadLongLiteralStateMode(scratch, ll_table, &ll_table->caseful,
state_case, 0);
loadLongLiteralStateMode(scratch, ll_table, &ll_table->nocase,
state_nocase, 1);
}

static rose_inline
char confirmLongLiteral(const struct RoseLongLitTable *ll_table,
const hs_scratch_t *scratch, u32 hashState,
const struct hs_scratch *scratch,
const struct RoseLongLitHashEntry *ent,
const char nocase) {
const struct RoseLongLiteral *litTab = getLitTab(ll_table);
u32 idx = findLitTabEntry(ll_table, hashState, nocase);
size_t found_offset = litTab[idx].offset;
const u8 *s = found_offset + (const u8 *)ll_table;
assert(hashState > found_offset);
size_t len = hashState - found_offset;
assert(ent->str_offset + ent->str_len <= ll_table->size);
const u8 *s = (const u8 *)ll_table + ent->str_offset;
size_t len = ent->str_len;
const u8 *buf = scratch->core_info.buf;
const size_t buf_len = scratch->core_info.len;
@@ -225,14 +175,13 @@ char confirmLongLiteral(const struct RoseLongLitTable *ll_table,
return 0;
}

DEBUG_PRINTF("confirmed hashState=%u\n", hashState);
return 1;
}

static rose_inline
void calcStreamingHash(const struct core_info *ci,
const struct RoseLongLitTable *ll_table, u8 hash_len,
u32 *hash_case, u32 *hash_nocase) {
const u8 *prepScanBuffer(const struct core_info *ci,
const struct RoseLongLitTable *ll_table, u8 *tempbuf) {
const u8 hash_len = ll_table->maxLen;
assert(hash_len >= LONG_LIT_HASH_LEN);

// Our hash function operates over LONG_LIT_HASH_LEN bytes, starting from
@@ -240,7 +189,6 @@ void calcStreamingHash(const struct core_info *ci,
// entirely from either the current buffer or the history buffer, we pass
// in the pointer directly; otherwise we must make a copy.

u8 tempbuf[LONG_LIT_HASH_LEN];
const u8 *base;

if (hash_len > ci->len) {
@@ -266,71 +214,7 @@ void calcStreamingHash(const struct core_info *ci,
base = ci->buf + ci->len - hash_len;
}

if (ll_table->hashNBitsCase) {
*hash_case = hashLongLiteral(base, LONG_LIT_HASH_LEN, 0);
DEBUG_PRINTF("caseful hash %u\n", *hash_case);
}
if (ll_table->hashNBitsNocase) {
*hash_nocase = hashLongLiteral(base, LONG_LIT_HASH_LEN, 1);
DEBUG_PRINTF("caseless hash %u\n", *hash_nocase);
}
}

static really_inline
const struct RoseLongLitHashEntry *
getHashTableBase(const struct RoseLongLitTable *ll_table, const char nocase) {
const u32 hashOffset = nocase ? ll_table->hashOffsetNocase
: ll_table->hashOffsetCase;
return (const struct RoseLongLitHashEntry *)((const char *)ll_table +
hashOffset);
}

static rose_inline
const struct RoseLongLitHashEntry *
getLongLitHashEnt(const struct RoseLongLitTable *ll_table, u32 h,
const char nocase) {
u32 nbits = nocase ? ll_table->hashNBitsNocase : ll_table->hashNBitsCase;
if (!nbits) {
return NULL;
}

u32 h_ent = h & ((1 << nbits) - 1);
u32 h_low = (h >> nbits) & 63;

const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, nocase);
const struct RoseLongLitHashEntry *ent = tab + h_ent;

if (!((ent->bitfield >> h_low) & 0x1)) {
return NULL;
}

return ent;
}

static rose_inline
u32 storeLongLiteralStateMode(const struct hs_scratch *scratch,
const struct RoseLongLitTable *ll_table,
const struct RoseLongLitHashEntry *ent,
const char nocase) {
assert(ent);
assert(nocase ? ll_table->hashNBitsNocase : ll_table->hashNBitsCase);

const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, nocase);

u32 packed_state = 0;
while (1) {
if (confirmLongLiteral(ll_table, scratch, ent->state, nocase)) {
packed_state = packStateVal(ll_table, nocase, ent->state);
DEBUG_PRINTF("set %s state to %u\n", nocase ? "nocase" : "case",
packed_state);
break;
}
if (ent->link == LINK_INVALID) {
break;
}
ent = tab + ent->link;
}
return packed_state;
return base;
}

#ifndef NDEBUG
@@ -359,8 +243,8 @@ void storeLongLitStreamState(const struct RoseLongLitTable *ll_table,
assert(ll_state);

u8 ss_bytes = ll_table->streamStateBytes;
u8 ssb = ll_table->streamStateBitsCase;
UNUSED u8 ssb_nc = ll_table->streamStateBitsNocase;
u8 ssb = ll_table->caseful.streamStateBits;
UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits;
assert(ss_bytes == ROUNDUP_N(ssb + ssb_nc, 8) / 8);
assert(!streamingTableOverflow(state_case, state_nocase, ssb, ssb_nc));
@@ -380,6 +264,65 @@ void storeLongLitStreamState(const struct RoseLongLitTable *ll_table,
partial_store_u64a(ll_state, stagingStreamState, ss_bytes);
}

static really_inline
char has_bit(const u8 *data, u32 bit) {
return (data[bit / 8] >> (bit % 8)) & 1;
}

static rose_inline
char bloomHasKey(const u8 *bloom, u32 bloom_mask, u32 hash) {
return has_bit(bloom, hash & bloom_mask);
}

static rose_inline
char checkBloomFilter(const struct RoseLongLitTable *ll_table,
const struct RoseLongLitSubtable *ll_sub,
const u8 *scan_buf, char nocase) {
assert(ll_sub->bloomBits);

const u8 *bloom = (const u8 *)ll_table + ll_sub->bloomOffset;
const u32 bloom_mask = (1U << ll_sub->bloomBits) - 1;

char v = 1;
v &= bloomHasKey(bloom, bloom_mask, bloomHash_1(scan_buf, nocase));
v &= bloomHasKey(bloom, bloom_mask, bloomHash_2(scan_buf, nocase));
v &= bloomHasKey(bloom, bloom_mask, bloomHash_3(scan_buf, nocase));
return v;
}

/**
* \brief Look for a hit in the hash table.
*
* Returns zero if not found, otherwise returns (bucket + 1).
*/
static rose_inline
u32 checkHashTable(const struct RoseLongLitTable *ll_table,
const struct RoseLongLitSubtable *ll_sub, const u8 *scan_buf,
const struct hs_scratch *scratch, char nocase) {
const u32 nbits = ll_sub->hashBits;
assert(nbits && nbits < 32);
const u32 num_entries = 1U << nbits;

const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub);

u32 hash = hashLongLiteral(scan_buf, LONG_LIT_HASH_LEN, nocase);
u32 bucket = hash & ((1U << nbits) - 1);

while (tab[bucket].str_offset != 0) {
DEBUG_PRINTF("checking bucket %u\n", bucket);
if (confirmLongLiteral(ll_table, scratch, &tab[bucket], nocase)) {
DEBUG_PRINTF("found hit for bucket %u\n", bucket);
return bucket + 1;
}

if (++bucket == num_entries) {
bucket = 0;
}
}

return 0;
}

static rose_inline
void storeLongLiteralState(const struct RoseEngine *t, char *state,
struct hs_scratch *scratch) {
@@ -401,28 +344,22 @@ void storeLongLiteralState(const struct RoseEngine *t, char *state,

// If we don't have enough history, we don't need to do anything.
if (ll_table->maxLen <= ci->len + ci->hlen) {
u32 hash_case = 0;
u32 hash_nocase = 0;
u8 tempbuf[LONG_LIT_HASH_LEN];
const u8 *scan_buf = prepScanBuffer(ci, ll_table, tempbuf);

calcStreamingHash(ci, ll_table, ll_table->maxLen, &hash_case,
&hash_nocase);

const struct RoseLongLitHashEntry *ent_case =
getLongLitHashEnt(ll_table, hash_case, 0);
const struct RoseLongLitHashEntry *ent_nocase =
getLongLitHashEnt(ll_table, hash_nocase, 1);

DEBUG_PRINTF("ent_caseful=%p, ent_caseless=%p\n", ent_case, ent_nocase);

if (ent_case) {
state_case = storeLongLiteralStateMode(scratch, ll_table,
ent_case, 0);
if (ll_table->caseful.hashBits &&
checkBloomFilter(ll_table, &ll_table->caseful, scan_buf, 0)) {
state_case = checkHashTable(ll_table, &ll_table->caseful, scan_buf,
scratch, 0);
}

if (ent_nocase) {
state_nocase = storeLongLiteralStateMode(scratch, ll_table,
ent_nocase, 1);
if (ll_table->nocase.hashBits &&
checkBloomFilter(ll_table, &ll_table->nocase, scan_buf, 1)) {
state_nocase = checkHashTable(ll_table, &ll_table->nocase, scan_buf,
scratch, 1);
}
} else {
DEBUG_PRINTF("not enough history (%zu bytes)\n", ci->len + ci->hlen);
}

DEBUG_PRINTF("store {%u, %u}\n", state_case, state_nocase);
@@ -30,17 +30,18 @@
#define STREAM_LONG_LIT_HASH_H

#include "ue2common.h"
#include "util/bitutils.h"
#include "util/unaligned.h"

/** \brief Length of the buffer operated on by \ref hashLongLiteral(). */
#define LONG_LIT_HASH_LEN 24

/** \brief Multiplier used by all the hash functions below. */
#define HASH_MULTIPLIER 0x0b4e0ef37bc32127ULL

/** \brief Hash function used for long literal table in streaming mode. */
static really_inline
u32 hashLongLiteral(const u8 *ptr, UNUSED size_t len, char nocase) {
const u64a CASEMASK = 0xdfdfdfdfdfdfdfdfULL;
const u64a MULTIPLIER = 0x0b4e0ef37bc32127ULL;

// We unconditionally hash LONG_LIT_HASH_LEN bytes; all use cases of this
// hash are for strings longer than this.
assert(len >= 24);
@@ -49,17 +50,56 @@ u32 hashLongLiteral(const u8 *ptr, UNUSED size_t len, char nocase) {
u64a v2 = unaligned_load_u64a(ptr + 8);
u64a v3 = unaligned_load_u64a(ptr + 16);
if (nocase) {
v1 &= CASEMASK;
v2 &= CASEMASK;
v3 &= CASEMASK;
v1 &= OCTO_CASE_CLEAR;
v2 &= OCTO_CASE_CLEAR;
v3 &= OCTO_CASE_CLEAR;
}
v1 *= MULTIPLIER;
v2 *= MULTIPLIER * MULTIPLIER;
v3 *= MULTIPLIER * MULTIPLIER * MULTIPLIER;
v1 *= HASH_MULTIPLIER;
v2 *= HASH_MULTIPLIER * HASH_MULTIPLIER;
v3 *= HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
v1 >>= 32;
v2 >>= 32;
v3 >>= 32;
return v1 ^ v2 ^ v3;
}

/**
* \brief Internal, used by the bloom filter hash functions below. Hashes 16
* bytes beginning at (ptr + offset).
*/
static really_inline
u32 bloomHash_i(const u8 *ptr, u32 offset, u64a multiplier, char nocase) {
assert(offset + 16 <= LONG_LIT_HASH_LEN);

u64a v = unaligned_load_u64a(ptr + offset);
if (nocase) {
v &= OCTO_CASE_CLEAR;
}
v *= multiplier;
return v >> 32;
}

/*
* We ensure that we see every byte of the first LONG_LIT_HASH_LEN bytes of
* input data (using at least one of the following functions).
*/

static really_inline
u32 bloomHash_1(const u8 *ptr, char nocase) {
const u64a multiplier = HASH_MULTIPLIER;
return bloomHash_i(ptr, 0, multiplier, nocase);
}

static really_inline
u32 bloomHash_2(const u8 *ptr, char nocase) {
const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER;
return bloomHash_i(ptr, 4, multiplier, nocase);
}

static really_inline
u32 bloomHash_3(const u8 *ptr, char nocase) {
const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
return bloomHash_i(ptr, 8, multiplier, nocase);
}

#endif // STREAM_LONG_LIT_HASH_H
@@ -70,6 +70,7 @@
#define CASE_BIT 0x20
#define CASE_CLEAR 0xdf
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL

static really_inline
u32 clz32(u32 x) {