mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
rose_build_matchers: be more careful w/ mixed-case
Overhaul the way fragment literals are added to HWLM and accel, fix some bugs shaken out by stricter mask use.
This commit is contained in:
parent
3bd0c7f6ad
commit
c83f2ea389
@ -395,7 +395,11 @@ void findMoreLiteralMasks(RoseBuildImpl &build) {
|
||||
static
|
||||
void addLiteralMask(const rose_literal_id &id, vector<u8> &msk,
|
||||
vector<u8> &cmp) {
|
||||
if (id.msk.empty() && !mixed_sensitivity(id.s)) {
|
||||
const size_t suffix_len = min(id.s.length(), size_t{HWLM_MASKLEN});
|
||||
bool mixed_suffix = mixed_sensitivity_in(id.s.end() - suffix_len,
|
||||
id.s.end());
|
||||
|
||||
if (id.msk.empty() && !mixed_suffix) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -415,10 +419,9 @@ void addLiteralMask(const rose_literal_id &id, vector<u8> &msk,
|
||||
}
|
||||
}
|
||||
|
||||
if (mixed_sensitivity(id.s)) {
|
||||
if (mixed_suffix) {
|
||||
auto it = id.s.rbegin();
|
||||
for (size_t i = 0, i_end = min(id.s.length(), size_t{HWLM_MASKLEN});
|
||||
i < i_end; ++i, ++it) {
|
||||
for (size_t i = 0; i < suffix_len; ++i, ++it) {
|
||||
const auto &c = *it;
|
||||
if (!c.nocase) {
|
||||
size_t offset = HWLM_MASKLEN - i - 1;
|
||||
@ -683,6 +686,81 @@ struct MatcherProto {
|
||||
};
|
||||
}
|
||||
|
||||
static
|
||||
void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp,
|
||||
const LitFragment &f, u32 id, bool delay_rebuild,
|
||||
size_t max_len) {
|
||||
const rose_literal_id &lit = build.literals.at(id);
|
||||
assert(id < build.literal_info.size());
|
||||
const auto &info = build.literal_info.at(id);
|
||||
|
||||
DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(lit.s).c_str(),
|
||||
lit.s.length());
|
||||
|
||||
vector<u8> msk = lit.msk; // copy
|
||||
vector<u8> cmp = lit.cmp; // copy
|
||||
bool noruns = isNoRunsLiteral(build, id, info, max_len);
|
||||
|
||||
auto lit_final = lit.s; // copy
|
||||
|
||||
if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) {
|
||||
DEBUG_PRINTF("truncating to tail of length %zu\n",
|
||||
size_t{ROSE_SHORT_LITERAL_LEN_MAX});
|
||||
lit_final.erase(0, lit_final.length() - ROSE_SHORT_LITERAL_LEN_MAX);
|
||||
// We shouldn't have set a threshold below 8 chars.
|
||||
assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX);
|
||||
assert(!noruns);
|
||||
}
|
||||
|
||||
addLiteralMask(lit, msk, cmp);
|
||||
|
||||
const auto &s_final = lit_final.get_string();
|
||||
bool nocase = lit_final.any_nocase();
|
||||
|
||||
DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n",
|
||||
f.fragment_id, escapeString(s_final).c_str(), (int)nocase,
|
||||
noruns, dumpMask(msk).c_str(), dumpMask(cmp).c_str());
|
||||
|
||||
if (!maskIsConsistent(s_final, nocase, msk, cmp)) {
|
||||
DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
|
||||
return;
|
||||
}
|
||||
|
||||
u32 prog_offset =
|
||||
delay_rebuild ? f.delay_program_offset : f.lit_program_offset;
|
||||
const auto &groups = f.groups;
|
||||
|
||||
mp.lits.emplace_back(move(s_final), nocase, noruns, prog_offset, groups,
|
||||
msk, cmp);
|
||||
}
|
||||
|
||||
static
|
||||
void addAccelLiteral(MatcherProto &mp, const rose_literal_id &lit,
|
||||
const rose_literal_info &info, size_t max_len) {
|
||||
const auto &s = lit.s; // copy
|
||||
|
||||
DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(s).c_str(), s.length());
|
||||
|
||||
vector<u8> msk = lit.msk; // copy
|
||||
vector<u8> cmp = lit.cmp; // copy
|
||||
addLiteralMask(lit, msk, cmp);
|
||||
|
||||
if (!maskIsConsistent(s.get_string(), s.any_nocase(), msk, cmp)) {
|
||||
DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// Literals used for acceleration must be limited to max_len, as that's all
|
||||
// we can see in history.
|
||||
string s_final = lit.s.get_string();
|
||||
trim_to_suffix(s_final, max_len);
|
||||
trim_to_suffix(msk, max_len);
|
||||
trim_to_suffix(cmp, max_len);
|
||||
|
||||
mp.accel_lits.emplace_back(s_final, lit.s.any_nocase(), msk, cmp,
|
||||
info.group_mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Build up a vector of literals (and associated other data) for the
|
||||
* given table.
|
||||
@ -702,27 +780,28 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build,
|
||||
assert(build.cc.streaming);
|
||||
}
|
||||
|
||||
for (const auto &f : fragments) {
|
||||
for (u32 id : f.lit_ids) {
|
||||
const rose_literal_id &lit = build.literals.at(id);
|
||||
vector<u32> used_lit_ids;
|
||||
|
||||
if (lit.table != table) {
|
||||
continue; /* wrong table */
|
||||
for (const auto &f : fragments) {
|
||||
assert(!f.lit_ids.empty());
|
||||
|
||||
// All literals that share a fragment are in the same table.
|
||||
if (build.literals.at(f.lit_ids.front()).table != table) {
|
||||
continue; // next fragment.
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("fragment %u, %zu lit_ids\n", f.fragment_id,
|
||||
f.lit_ids.size());
|
||||
|
||||
used_lit_ids.clear();
|
||||
for (u32 id : f.lit_ids) {
|
||||
const rose_literal_id &lit = build.literals.at(id);
|
||||
assert(id < build.literal_info.size());
|
||||
const auto &info = build.literal_info.at(id);
|
||||
if (lit.delay) {
|
||||
continue; /* delay id's are virtual-ish */
|
||||
}
|
||||
|
||||
assert(id < build.literal_info.size());
|
||||
const auto &info = build.literal_info.at(id);
|
||||
|
||||
/* Note: requires_benefits are handled in the literal entries */
|
||||
const ue2_literal &s = lit.s;
|
||||
|
||||
DEBUG_PRINTF("lit='%s' (len %zu)\n", escapeString(s).c_str(),
|
||||
s.length());
|
||||
|
||||
// When building the delay rebuild table, we only want to include
|
||||
// literals that have delayed variants.
|
||||
if (delay_rebuild && info.delayed_ids.empty()) {
|
||||
@ -739,69 +818,39 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build,
|
||||
}
|
||||
}
|
||||
|
||||
vector<u8> msk = lit.msk; // copy
|
||||
vector<u8> cmp = lit.cmp; // copy
|
||||
bool noruns = isNoRunsLiteral(build, id, info, max_len);
|
||||
used_lit_ids.push_back(id);
|
||||
}
|
||||
|
||||
size_t lit_hist_len = 0;
|
||||
if (used_lit_ids.empty()) {
|
||||
continue; // next fragment.
|
||||
}
|
||||
|
||||
// Build our fragment (for the HWLM matcher) from the first literal.
|
||||
addFragmentLiteral(build, mp, f, used_lit_ids.front(), delay_rebuild,
|
||||
max_len);
|
||||
|
||||
for (u32 id : used_lit_ids) {
|
||||
const rose_literal_id &lit = build.literals.at(id);
|
||||
assert(id < build.literal_info.size());
|
||||
const auto &info = build.literal_info.at(id);
|
||||
|
||||
// All literals contribute accel information.
|
||||
addAccelLiteral(mp, lit, info, max_len);
|
||||
|
||||
// All literals contribute to history requirement in streaming mode.
|
||||
if (build.cc.streaming) {
|
||||
lit_hist_len = max(msk.size(), min(s.length(), max_len));
|
||||
size_t lit_hist_len =
|
||||
max(lit.msk.size(), min(lit.s.length(), max_len));
|
||||
lit_hist_len = lit_hist_len ? lit_hist_len - 1 : 0;
|
||||
}
|
||||
DEBUG_PRINTF("lit requires %zu bytes of history\n", lit_hist_len);
|
||||
DEBUG_PRINTF("lit requires %zu bytes of history\n",
|
||||
lit_hist_len);
|
||||
assert(lit_hist_len <= build.cc.grey.maxHistoryAvailable);
|
||||
|
||||
auto lit_final = s; // copy
|
||||
|
||||
if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) {
|
||||
DEBUG_PRINTF("truncating to tail of length %zu\n",
|
||||
size_t{ROSE_SHORT_LITERAL_LEN_MAX});
|
||||
lit_final.erase(0, lit_final.length()
|
||||
- ROSE_SHORT_LITERAL_LEN_MAX);
|
||||
// We shouldn't have set a threshold below 8 chars.
|
||||
assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX);
|
||||
assert(!noruns);
|
||||
}
|
||||
|
||||
addLiteralMask(lit, msk, cmp);
|
||||
|
||||
const auto &s_final = lit_final.get_string();
|
||||
bool nocase = lit_final.any_nocase();
|
||||
|
||||
DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, "
|
||||
"cmp=%s\n", f.fragment_id,
|
||||
escapeString(s_final).c_str(), (int)nocase, noruns,
|
||||
dumpMask(msk).c_str(), dumpMask(cmp).c_str());
|
||||
|
||||
if (!maskIsConsistent(s_final, nocase, msk, cmp)) {
|
||||
DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
mp.accel_lits.emplace_back(s.get_string(), s.any_nocase(), msk, cmp,
|
||||
info.group_mask);
|
||||
mp.history_required = max(mp.history_required, lit_hist_len);
|
||||
|
||||
u32 prog_offset = delay_rebuild ? f.delay_program_offset
|
||||
: f.lit_program_offset;
|
||||
const auto &groups = f.groups;
|
||||
|
||||
mp.lits.emplace_back(move(s_final), nocase, noruns, prog_offset,
|
||||
groups, msk, cmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sort_and_unique(mp.lits);
|
||||
|
||||
// Literals used for acceleration must be limited to max_len, as that's all
|
||||
// we can see in history.
|
||||
for_each(begin(mp.accel_lits), end(mp.accel_lits),
|
||||
[&max_len](AccelString &a) {
|
||||
trim_to_suffix(a.s, max_len);
|
||||
trim_to_suffix(a.msk, max_len);
|
||||
trim_to_suffix(a.cmp, max_len);
|
||||
});
|
||||
|
||||
sort_and_unique(mp.accel_lits);
|
||||
|
||||
return mp;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -320,23 +320,6 @@ bool ue2_literal::any_nocase() const {
|
||||
return find(nocase.begin(), nocase.end(), true) != nocase.end();
|
||||
}
|
||||
|
||||
bool mixed_sensitivity(const ue2_literal &s) {
|
||||
bool cs = false;
|
||||
bool nc = false;
|
||||
for (ue2_literal::const_iterator it = s.begin(); it != s.end(); ++it) {
|
||||
if (!ourisalpha(it->c)) {
|
||||
continue;
|
||||
}
|
||||
if (it->nocase) {
|
||||
nc = true;
|
||||
} else {
|
||||
cs = true;
|
||||
}
|
||||
}
|
||||
|
||||
return cs && nc;
|
||||
}
|
||||
|
||||
void make_nocase(ue2_literal *lit) {
|
||||
ue2_literal rv;
|
||||
|
||||
|
@ -35,6 +35,7 @@
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/charreach.h"
|
||||
#include "util/compare.h"
|
||||
#include "util/hash.h"
|
||||
|
||||
#include <iterator>
|
||||
@ -226,9 +227,36 @@ size_t maxStringSelfOverlap(const ue2_literal &a);
|
||||
size_t minStringPeriod(const ue2_literal &a);
|
||||
size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b);
|
||||
|
||||
/** \brief True iff the literal cannot be considered entirely case-sensitive
|
||||
* nor entirely case-insensitive */
|
||||
bool mixed_sensitivity(const ue2_literal &lit);
|
||||
/**
 * \brief True iff the given range of a literal contains both a caseless and a
 * case-sensitive alphabetic character, i.e. it can be treated neither as
 * entirely case-sensitive nor as entirely case-insensitive.
 *
 * Elements reached through the iterators must expose a character member \c c
 * and a boolean member \c nocase. Elements for which ourisalpha() is false do
 * not influence the result.
 *
 * \param begin Start of the range to examine.
 * \param end One past the last element of the range.
 */
template<class Iter>
bool mixed_sensitivity_in(Iter begin, Iter end) {
    bool seen_caseful = false;
    bool seen_caseless = false;

    for (Iter cur = begin; cur != end; ++cur) {
        if (ourisalpha(cur->c)) {
            if (cur->nocase) {
                seen_caseless = true;
            } else {
                seen_caseful = true;
            }
        }
    }

    return seen_caseful && seen_caseless;
}
|
||||
|
||||
/**
|
||||
* \brief True iff the literal cannot be considered entirely case-sensitive
|
||||
* nor entirely case-insensitive.
|
||||
*/
|
||||
inline
|
||||
bool mixed_sensitivity(const ue2_literal &s) {
|
||||
return mixed_sensitivity_in(s.begin(), s.end());
|
||||
}
|
||||
|
||||
void make_nocase(ue2_literal *lit);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user