mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2026-01-02 06:34:41 +03:00
rose: simplify long lit table, add bloom filter
Replaces the original long lit hash table (used in streaming mode) with a smaller, simpler linear probing approach. Adds a bloom filter in front of it to reduce time spent on false positives. Sizing of both the hash table and the bloom filter is done based on max load.
This commit is contained in:
committed by
Matthew Barr
parent
68bf473e2e
commit
8869dee643
@@ -30,17 +30,18 @@
|
||||
#define STREAM_LONG_LIT_HASH_H
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/bitutils.h"
|
||||
#include "util/unaligned.h"
|
||||
|
||||
/** \brief Length of the buffer operated on by \ref hashLongLiteral(). */
|
||||
#define LONG_LIT_HASH_LEN 24
|
||||
|
||||
/** \brief Multiplier used by all the hash functions below. */
|
||||
#define HASH_MULTIPLIER 0x0b4e0ef37bc32127ULL
|
||||
|
||||
/** \brief Hash function used for long literal table in streaming mode. */
|
||||
static really_inline
|
||||
u32 hashLongLiteral(const u8 *ptr, UNUSED size_t len, char nocase) {
|
||||
const u64a CASEMASK = 0xdfdfdfdfdfdfdfdfULL;
|
||||
const u64a MULTIPLIER = 0x0b4e0ef37bc32127ULL;
|
||||
|
||||
// We unconditionally hash LONG_LIT_HASH_LEN bytes; all use cases of this
|
||||
// hash are for strings longer than this.
|
||||
assert(len >= 24);
|
||||
@@ -49,17 +50,56 @@ u32 hashLongLiteral(const u8 *ptr, UNUSED size_t len, char nocase) {
|
||||
u64a v2 = unaligned_load_u64a(ptr + 8);
|
||||
u64a v3 = unaligned_load_u64a(ptr + 16);
|
||||
if (nocase) {
|
||||
v1 &= CASEMASK;
|
||||
v2 &= CASEMASK;
|
||||
v3 &= CASEMASK;
|
||||
v1 &= OCTO_CASE_CLEAR;
|
||||
v2 &= OCTO_CASE_CLEAR;
|
||||
v3 &= OCTO_CASE_CLEAR;
|
||||
}
|
||||
v1 *= MULTIPLIER;
|
||||
v2 *= MULTIPLIER * MULTIPLIER;
|
||||
v3 *= MULTIPLIER * MULTIPLIER * MULTIPLIER;
|
||||
v1 *= HASH_MULTIPLIER;
|
||||
v2 *= HASH_MULTIPLIER * HASH_MULTIPLIER;
|
||||
v3 *= HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
|
||||
v1 >>= 32;
|
||||
v2 >>= 32;
|
||||
v3 >>= 32;
|
||||
return v1 ^ v2 ^ v3;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Internal, used by the bloom filter hash functions below. Hashes 16
|
||||
* bytes beginning at (ptr + offset).
|
||||
*/
|
||||
static really_inline
|
||||
u32 bloomHash_i(const u8 *ptr, u32 offset, u64a multiplier, char nocase) {
|
||||
assert(offset + 16 <= LONG_LIT_HASH_LEN);
|
||||
|
||||
u64a v = unaligned_load_u64a(ptr + offset);
|
||||
if (nocase) {
|
||||
v &= OCTO_CASE_CLEAR;
|
||||
}
|
||||
v *= multiplier;
|
||||
return v >> 32;
|
||||
}
|
||||
|
||||
/*
|
||||
* We ensure that we see every byte of the first LONG_LIT_HASH_LEN bytes of input
|
||||
* data (using at least one of the following functions).
|
||||
*/
|
||||
|
||||
static really_inline
|
||||
u32 bloomHash_1(const u8 *ptr, char nocase) {
|
||||
const u64a multiplier = HASH_MULTIPLIER;
|
||||
return bloomHash_i(ptr, 0, multiplier, nocase);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 bloomHash_2(const u8 *ptr, char nocase) {
|
||||
const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER;
|
||||
return bloomHash_i(ptr, 4, multiplier, nocase);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 bloomHash_3(const u8 *ptr, char nocase) {
|
||||
const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
|
||||
return bloomHash_i(ptr, 8, multiplier, nocase);
|
||||
}
|
||||
|
||||
#endif // STREAM_LONG_LIT_HASH_H
|
||||
|
||||
Reference in New Issue
Block a user