mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
rose_build_matchers: be more careful w/ mixed-case
Overhaul the way fragment literals are added to HWLM and accel, and fix some bugs shaken out by stricter mask use.
This commit is contained in:
parent
3bd0c7f6ad
commit
c83f2ea389
@ -395,7 +395,11 @@ void findMoreLiteralMasks(RoseBuildImpl &build) {
|
|||||||
static
|
static
|
||||||
void addLiteralMask(const rose_literal_id &id, vector<u8> &msk,
|
void addLiteralMask(const rose_literal_id &id, vector<u8> &msk,
|
||||||
vector<u8> &cmp) {
|
vector<u8> &cmp) {
|
||||||
if (id.msk.empty() && !mixed_sensitivity(id.s)) {
|
const size_t suffix_len = min(id.s.length(), size_t{HWLM_MASKLEN});
|
||||||
|
bool mixed_suffix = mixed_sensitivity_in(id.s.end() - suffix_len,
|
||||||
|
id.s.end());
|
||||||
|
|
||||||
|
if (id.msk.empty() && !mixed_suffix) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -415,10 +419,9 @@ void addLiteralMask(const rose_literal_id &id, vector<u8> &msk,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mixed_sensitivity(id.s)) {
|
if (mixed_suffix) {
|
||||||
auto it = id.s.rbegin();
|
auto it = id.s.rbegin();
|
||||||
for (size_t i = 0, i_end = min(id.s.length(), size_t{HWLM_MASKLEN});
|
for (size_t i = 0; i < suffix_len; ++i, ++it) {
|
||||||
i < i_end; ++i, ++it) {
|
|
||||||
const auto &c = *it;
|
const auto &c = *it;
|
||||||
if (!c.nocase) {
|
if (!c.nocase) {
|
||||||
size_t offset = HWLM_MASKLEN - i - 1;
|
size_t offset = HWLM_MASKLEN - i - 1;
|
||||||
@ -683,6 +686,81 @@ struct MatcherProto {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp,
|
||||||
|
const LitFragment &f, u32 id, bool delay_rebuild,
|
||||||
|
size_t max_len) {
|
||||||
|
const rose_literal_id &lit = build.literals.at(id);
|
||||||
|
assert(id < build.literal_info.size());
|
||||||
|
const auto &info = build.literal_info.at(id);
|
||||||
|
|
||||||
|
DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(lit.s).c_str(),
|
||||||
|
lit.s.length());
|
||||||
|
|
||||||
|
vector<u8> msk = lit.msk; // copy
|
||||||
|
vector<u8> cmp = lit.cmp; // copy
|
||||||
|
bool noruns = isNoRunsLiteral(build, id, info, max_len);
|
||||||
|
|
||||||
|
auto lit_final = lit.s; // copy
|
||||||
|
|
||||||
|
if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) {
|
||||||
|
DEBUG_PRINTF("truncating to tail of length %zu\n",
|
||||||
|
size_t{ROSE_SHORT_LITERAL_LEN_MAX});
|
||||||
|
lit_final.erase(0, lit_final.length() - ROSE_SHORT_LITERAL_LEN_MAX);
|
||||||
|
// We shouldn't have set a threshold below 8 chars.
|
||||||
|
assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX);
|
||||||
|
assert(!noruns);
|
||||||
|
}
|
||||||
|
|
||||||
|
addLiteralMask(lit, msk, cmp);
|
||||||
|
|
||||||
|
const auto &s_final = lit_final.get_string();
|
||||||
|
bool nocase = lit_final.any_nocase();
|
||||||
|
|
||||||
|
DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n",
|
||||||
|
f.fragment_id, escapeString(s_final).c_str(), (int)nocase,
|
||||||
|
noruns, dumpMask(msk).c_str(), dumpMask(cmp).c_str());
|
||||||
|
|
||||||
|
if (!maskIsConsistent(s_final, nocase, msk, cmp)) {
|
||||||
|
DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
u32 prog_offset =
|
||||||
|
delay_rebuild ? f.delay_program_offset : f.lit_program_offset;
|
||||||
|
const auto &groups = f.groups;
|
||||||
|
|
||||||
|
mp.lits.emplace_back(move(s_final), nocase, noruns, prog_offset, groups,
|
||||||
|
msk, cmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
void addAccelLiteral(MatcherProto &mp, const rose_literal_id &lit,
|
||||||
|
const rose_literal_info &info, size_t max_len) {
|
||||||
|
const auto &s = lit.s; // copy
|
||||||
|
|
||||||
|
DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(s).c_str(), s.length());
|
||||||
|
|
||||||
|
vector<u8> msk = lit.msk; // copy
|
||||||
|
vector<u8> cmp = lit.cmp; // copy
|
||||||
|
addLiteralMask(lit, msk, cmp);
|
||||||
|
|
||||||
|
if (!maskIsConsistent(s.get_string(), s.any_nocase(), msk, cmp)) {
|
||||||
|
DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Literals used for acceleration must be limited to max_len, as that's all
|
||||||
|
// we can see in history.
|
||||||
|
string s_final = lit.s.get_string();
|
||||||
|
trim_to_suffix(s_final, max_len);
|
||||||
|
trim_to_suffix(msk, max_len);
|
||||||
|
trim_to_suffix(cmp, max_len);
|
||||||
|
|
||||||
|
mp.accel_lits.emplace_back(s_final, lit.s.any_nocase(), msk, cmp,
|
||||||
|
info.group_mask);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Build up a vector of literals (and associated other data) for the
|
* \brief Build up a vector of literals (and associated other data) for the
|
||||||
* given table.
|
* given table.
|
||||||
@ -702,26 +780,27 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build,
|
|||||||
assert(build.cc.streaming);
|
assert(build.cc.streaming);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vector<u32> used_lit_ids;
|
||||||
|
|
||||||
for (const auto &f : fragments) {
|
for (const auto &f : fragments) {
|
||||||
|
assert(!f.lit_ids.empty());
|
||||||
|
|
||||||
|
// All literals that share a fragment are in the same table.
|
||||||
|
if (build.literals.at(f.lit_ids.front()).table != table) {
|
||||||
|
continue; // next fragment.
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_PRINTF("fragment %u, %zu lit_ids\n", f.fragment_id,
|
||||||
|
f.lit_ids.size());
|
||||||
|
|
||||||
|
used_lit_ids.clear();
|
||||||
for (u32 id : f.lit_ids) {
|
for (u32 id : f.lit_ids) {
|
||||||
const rose_literal_id &lit = build.literals.at(id);
|
const rose_literal_id &lit = build.literals.at(id);
|
||||||
|
|
||||||
if (lit.table != table) {
|
|
||||||
continue; /* wrong table */
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lit.delay) {
|
|
||||||
continue; /* delay id's are virtual-ish */
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(id < build.literal_info.size());
|
assert(id < build.literal_info.size());
|
||||||
const auto &info = build.literal_info.at(id);
|
const auto &info = build.literal_info.at(id);
|
||||||
|
if (lit.delay) {
|
||||||
/* Note: requires_benefits are handled in the literal entries */
|
continue; /* delay id's are virtual-ish */
|
||||||
const ue2_literal &s = lit.s;
|
}
|
||||||
|
|
||||||
DEBUG_PRINTF("lit='%s' (len %zu)\n", escapeString(s).c_str(),
|
|
||||||
s.length());
|
|
||||||
|
|
||||||
// When building the delay rebuild table, we only want to include
|
// When building the delay rebuild table, we only want to include
|
||||||
// literals that have delayed variants.
|
// literals that have delayed variants.
|
||||||
@ -739,69 +818,39 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<u8> msk = lit.msk; // copy
|
used_lit_ids.push_back(id);
|
||||||
vector<u8> cmp = lit.cmp; // copy
|
}
|
||||||
bool noruns = isNoRunsLiteral(build, id, info, max_len);
|
|
||||||
|
|
||||||
size_t lit_hist_len = 0;
|
if (used_lit_ids.empty()) {
|
||||||
|
continue; // next fragment.
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build our fragment (for the HWLM matcher) from the first literal.
|
||||||
|
addFragmentLiteral(build, mp, f, used_lit_ids.front(), delay_rebuild,
|
||||||
|
max_len);
|
||||||
|
|
||||||
|
for (u32 id : used_lit_ids) {
|
||||||
|
const rose_literal_id &lit = build.literals.at(id);
|
||||||
|
assert(id < build.literal_info.size());
|
||||||
|
const auto &info = build.literal_info.at(id);
|
||||||
|
|
||||||
|
// All literals contribute accel information.
|
||||||
|
addAccelLiteral(mp, lit, info, max_len);
|
||||||
|
|
||||||
|
// All literals contribute to history requirement in streaming mode.
|
||||||
if (build.cc.streaming) {
|
if (build.cc.streaming) {
|
||||||
lit_hist_len = max(msk.size(), min(s.length(), max_len));
|
size_t lit_hist_len =
|
||||||
|
max(lit.msk.size(), min(lit.s.length(), max_len));
|
||||||
lit_hist_len = lit_hist_len ? lit_hist_len - 1 : 0;
|
lit_hist_len = lit_hist_len ? lit_hist_len - 1 : 0;
|
||||||
|
DEBUG_PRINTF("lit requires %zu bytes of history\n",
|
||||||
|
lit_hist_len);
|
||||||
|
assert(lit_hist_len <= build.cc.grey.maxHistoryAvailable);
|
||||||
|
mp.history_required = max(mp.history_required, lit_hist_len);
|
||||||
}
|
}
|
||||||
DEBUG_PRINTF("lit requires %zu bytes of history\n", lit_hist_len);
|
|
||||||
assert(lit_hist_len <= build.cc.grey.maxHistoryAvailable);
|
|
||||||
|
|
||||||
auto lit_final = s; // copy
|
|
||||||
|
|
||||||
if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) {
|
|
||||||
DEBUG_PRINTF("truncating to tail of length %zu\n",
|
|
||||||
size_t{ROSE_SHORT_LITERAL_LEN_MAX});
|
|
||||||
lit_final.erase(0, lit_final.length()
|
|
||||||
- ROSE_SHORT_LITERAL_LEN_MAX);
|
|
||||||
// We shouldn't have set a threshold below 8 chars.
|
|
||||||
assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX);
|
|
||||||
assert(!noruns);
|
|
||||||
}
|
|
||||||
|
|
||||||
addLiteralMask(lit, msk, cmp);
|
|
||||||
|
|
||||||
const auto &s_final = lit_final.get_string();
|
|
||||||
bool nocase = lit_final.any_nocase();
|
|
||||||
|
|
||||||
DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, "
|
|
||||||
"cmp=%s\n", f.fragment_id,
|
|
||||||
escapeString(s_final).c_str(), (int)nocase, noruns,
|
|
||||||
dumpMask(msk).c_str(), dumpMask(cmp).c_str());
|
|
||||||
|
|
||||||
if (!maskIsConsistent(s_final, nocase, msk, cmp)) {
|
|
||||||
DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
mp.accel_lits.emplace_back(s.get_string(), s.any_nocase(), msk, cmp,
|
|
||||||
info.group_mask);
|
|
||||||
mp.history_required = max(mp.history_required, lit_hist_len);
|
|
||||||
|
|
||||||
u32 prog_offset = delay_rebuild ? f.delay_program_offset
|
|
||||||
: f.lit_program_offset;
|
|
||||||
const auto &groups = f.groups;
|
|
||||||
|
|
||||||
mp.lits.emplace_back(move(s_final), nocase, noruns, prog_offset,
|
|
||||||
groups, msk, cmp);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sort_and_unique(mp.lits);
|
sort_and_unique(mp.lits);
|
||||||
|
|
||||||
// Literals used for acceleration must be limited to max_len, as that's all
|
|
||||||
// we can see in history.
|
|
||||||
for_each(begin(mp.accel_lits), end(mp.accel_lits),
|
|
||||||
[&max_len](AccelString &a) {
|
|
||||||
trim_to_suffix(a.s, max_len);
|
|
||||||
trim_to_suffix(a.msk, max_len);
|
|
||||||
trim_to_suffix(a.cmp, max_len);
|
|
||||||
});
|
|
||||||
|
|
||||||
sort_and_unique(mp.accel_lits);
|
sort_and_unique(mp.accel_lits);
|
||||||
|
|
||||||
return mp;
|
return mp;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2016, Intel Corporation
|
* Copyright (c) 2015-2017, Intel Corporation
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@ -320,23 +320,6 @@ bool ue2_literal::any_nocase() const {
|
|||||||
return find(nocase.begin(), nocase.end(), true) != nocase.end();
|
return find(nocase.begin(), nocase.end(), true) != nocase.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mixed_sensitivity(const ue2_literal &s) {
|
|
||||||
bool cs = false;
|
|
||||||
bool nc = false;
|
|
||||||
for (ue2_literal::const_iterator it = s.begin(); it != s.end(); ++it) {
|
|
||||||
if (!ourisalpha(it->c)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (it->nocase) {
|
|
||||||
nc = true;
|
|
||||||
} else {
|
|
||||||
cs = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return cs && nc;
|
|
||||||
}
|
|
||||||
|
|
||||||
void make_nocase(ue2_literal *lit) {
|
void make_nocase(ue2_literal *lit) {
|
||||||
ue2_literal rv;
|
ue2_literal rv;
|
||||||
|
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
|
|
||||||
#include "ue2common.h"
|
#include "ue2common.h"
|
||||||
#include "util/charreach.h"
|
#include "util/charreach.h"
|
||||||
|
#include "util/compare.h"
|
||||||
#include "util/hash.h"
|
#include "util/hash.h"
|
||||||
|
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
@ -226,9 +227,36 @@ size_t maxStringSelfOverlap(const ue2_literal &a);
|
|||||||
size_t minStringPeriod(const ue2_literal &a);
|
size_t minStringPeriod(const ue2_literal &a);
|
||||||
size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b);
|
size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b);
|
||||||
|
|
||||||
/** \brief True iff the literal cannot be considered entirely case-sensitive
|
/**
|
||||||
* nor entirely case-insensitive */
|
* \brief True iff the range of a literal given cannot be considered entirely
|
||||||
bool mixed_sensitivity(const ue2_literal &lit);
|
* case-sensitive nor entirely case-insensitive.
|
||||||
|
*/
|
||||||
|
template<class Iter>
bool mixed_sensitivity_in(Iter begin, Iter end) {
    bool has_caseless_alpha = false;
    bool has_caseful_alpha = false;

    // Walk [begin, end) by advancing the by-value iterator parameter.
    while (begin != end) {
        // Only elements accepted by ourisalpha() influence the result.
        if (ourisalpha(begin->c)) {
            if (begin->nocase) {
                has_caseless_alpha = true;
            } else {
                has_caseful_alpha = true;
            }
        }
        ++begin;
    }

    // Mixed means at least one of each kind was seen.
    return has_caseful_alpha && has_caseless_alpha;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief True iff the literal cannot be considered entirely case-sensitive
|
||||||
|
* nor entirely case-insensitive.
|
||||||
|
*/
|
||||||
|
inline
|
||||||
|
bool mixed_sensitivity(const ue2_literal &s) {
|
||||||
|
return mixed_sensitivity_in(s.begin(), s.end());
|
||||||
|
}
|
||||||
|
|
||||||
void make_nocase(ue2_literal *lit);
|
void make_nocase(ue2_literal *lit);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user