rose_build_matchers: be more careful w/ mixed-case

Overhaul the way fragment literals are added to HWLM and accel, fix
some bugs shaken out by stricter mask use.
This commit is contained in:
Justin Viiret 2017-06-22 10:37:31 +10:00 committed by Matthew Barr
parent 3bd0c7f6ad
commit c83f2ea389
3 changed files with 155 additions and 95 deletions

View File

@ -395,7 +395,11 @@ void findMoreLiteralMasks(RoseBuildImpl &build) {
static
void addLiteralMask(const rose_literal_id &id, vector<u8> &msk,
vector<u8> &cmp) {
if (id.msk.empty() && !mixed_sensitivity(id.s)) {
const size_t suffix_len = min(id.s.length(), size_t{HWLM_MASKLEN});
bool mixed_suffix = mixed_sensitivity_in(id.s.end() - suffix_len,
id.s.end());
if (id.msk.empty() && !mixed_suffix) {
return;
}
@ -415,10 +419,9 @@ void addLiteralMask(const rose_literal_id &id, vector<u8> &msk,
}
}
if (mixed_sensitivity(id.s)) {
if (mixed_suffix) {
auto it = id.s.rbegin();
for (size_t i = 0, i_end = min(id.s.length(), size_t{HWLM_MASKLEN});
i < i_end; ++i, ++it) {
for (size_t i = 0; i < suffix_len; ++i, ++it) {
const auto &c = *it;
if (!c.nocase) {
size_t offset = HWLM_MASKLEN - i - 1;
@ -683,6 +686,81 @@ struct MatcherProto {
};
}
/**
 * \brief Add the HWLM literal for fragment \p f to the matcher prototype,
 * using the literal with ID \p id as its source.
 *
 * The literal string is cut down to its last ROSE_SHORT_LITERAL_LEN_MAX
 * characters when it is longer than that. If the resulting string cannot
 * match under the literal's msk/cmp pair, nothing is added.
 *
 * \param build Build state supplying the literal and its literal_info.
 * \param mp Matcher prototype; a new entry is appended to mp.lits.
 * \param f Fragment supplying the fragment ID (debug output only), the
 *        groups and the program offsets.
 * \param id ID of the literal to build the fragment from.
 * \param delay_rebuild If true, f.delay_program_offset is used as the
 *        program offset; otherwise f.lit_program_offset.
 * \param max_len Forwarded to isNoRunsLiteral().
 */
static
void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp,
                        const LitFragment &f, u32 id, bool delay_rebuild,
                        size_t max_len) {
    const rose_literal_id &lit = build.literals.at(id);
    assert(id < build.literal_info.size());
    const auto &info = build.literal_info.at(id);

    DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(lit.s).c_str(),
                 lit.s.length());

    // addLiteralMask() takes msk/cmp by non-const reference, so work on
    // local copies rather than on the literal's own vectors.
    vector<u8> msk = lit.msk; // copy
    vector<u8> cmp = lit.cmp; // copy
    bool noruns = isNoRunsLiteral(build, id, info, max_len);

    auto lit_final = lit.s; // copy

    if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) {
        DEBUG_PRINTF("truncating to tail of length %zu\n",
                     size_t{ROSE_SHORT_LITERAL_LEN_MAX});
        lit_final.erase(0, lit_final.length() - ROSE_SHORT_LITERAL_LEN_MAX);
        // We shouldn't have set a threshold below 8 chars.
        assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX);
        assert(!noruns);
    }

    // The mask is derived from the untruncated literal.
    addLiteralMask(lit, msk, cmp);

    const auto &s_final = lit_final.get_string();
    bool nocase = lit_final.any_nocase();

    DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n",
                 f.fragment_id, escapeString(s_final).c_str(), (int)nocase,
                 noruns, dumpMask(msk).c_str(), dumpMask(cmp).c_str());

    if (!maskIsConsistent(s_final, nocase, msk, cmp)) {
        DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
        return;
    }

    u32 prog_offset =
        delay_rebuild ? f.delay_program_offset : f.lit_program_offset;
    const auto &groups = f.groups;

    // s_final is a reference to const, so it can only ever be copied into
    // the new entry; it is passed as-is rather than through a move() that
    // could not take effect.
    mp.lits.emplace_back(s_final, nocase, noruns, prog_offset, groups, msk,
                         cmp);
}
/**
 * \brief Append an acceleration literal built from \p lit to mp.accel_lits.
 *
 * Nothing is added if the literal's string cannot match under its msk/cmp
 * pair (after addLiteralMask() has been applied to it). Otherwise the string,
 * msk and cmp are each passed through trim_to_suffix() with \p max_len before
 * being stored alongside info.group_mask.
 */
static
void addAccelLiteral(MatcherProto &mp, const rose_literal_id &lit,
                     const rose_literal_info &info, size_t max_len) {
    const auto &literal = lit.s; // bound by reference, nothing is copied

    DEBUG_PRINTF("lit='%s' (len %zu)\n", dumpString(literal).c_str(),
                 literal.length());

    // Private copies of the mask pair: addLiteralMask() takes them by
    // non-const reference and they are trimmed further down.
    vector<u8> accel_msk(lit.msk);
    vector<u8> accel_cmp(lit.cmp);
    addLiteralMask(lit, accel_msk, accel_cmp);

    const bool any_nocase = literal.any_nocase();

    if (!maskIsConsistent(literal.get_string(), any_nocase, accel_msk,
                          accel_cmp)) {
        DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
        return;
    }

    // Literals used for acceleration must be limited to max_len, as that's
    // all we can see in history.
    string accel_str = literal.get_string();
    trim_to_suffix(accel_str, max_len);
    trim_to_suffix(accel_msk, max_len);
    trim_to_suffix(accel_cmp, max_len);

    mp.accel_lits.emplace_back(accel_str, any_nocase, accel_msk, accel_cmp,
                               info.group_mask);
}
/**
* \brief Build up a vector of literals (and associated other data) for the
* given table.
@ -702,27 +780,28 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build,
assert(build.cc.streaming);
}
for (const auto &f : fragments) {
for (u32 id : f.lit_ids) {
const rose_literal_id &lit = build.literals.at(id);
vector<u32> used_lit_ids;
if (lit.table != table) {
continue; /* wrong table */
for (const auto &f : fragments) {
assert(!f.lit_ids.empty());
// All literals that share a fragment are in the same table.
if (build.literals.at(f.lit_ids.front()).table != table) {
continue; // next fragment.
}
DEBUG_PRINTF("fragment %u, %zu lit_ids\n", f.fragment_id,
f.lit_ids.size());
used_lit_ids.clear();
for (u32 id : f.lit_ids) {
const rose_literal_id &lit = build.literals.at(id);
assert(id < build.literal_info.size());
const auto &info = build.literal_info.at(id);
if (lit.delay) {
continue; /* delay id's are virtual-ish */
}
assert(id < build.literal_info.size());
const auto &info = build.literal_info.at(id);
/* Note: requires_benefits are handled in the literal entries */
const ue2_literal &s = lit.s;
DEBUG_PRINTF("lit='%s' (len %zu)\n", escapeString(s).c_str(),
s.length());
// When building the delay rebuild table, we only want to include
// literals that have delayed variants.
if (delay_rebuild && info.delayed_ids.empty()) {
@ -739,69 +818,39 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build,
}
}
vector<u8> msk = lit.msk; // copy
vector<u8> cmp = lit.cmp; // copy
bool noruns = isNoRunsLiteral(build, id, info, max_len);
used_lit_ids.push_back(id);
}
size_t lit_hist_len = 0;
if (used_lit_ids.empty()) {
continue; // next fragment.
}
// Build our fragment (for the HWLM matcher) from the first literal.
addFragmentLiteral(build, mp, f, used_lit_ids.front(), delay_rebuild,
max_len);
for (u32 id : used_lit_ids) {
const rose_literal_id &lit = build.literals.at(id);
assert(id < build.literal_info.size());
const auto &info = build.literal_info.at(id);
// All literals contribute accel information.
addAccelLiteral(mp, lit, info, max_len);
// All literals contribute to history requirement in streaming mode.
if (build.cc.streaming) {
lit_hist_len = max(msk.size(), min(s.length(), max_len));
size_t lit_hist_len =
max(lit.msk.size(), min(lit.s.length(), max_len));
lit_hist_len = lit_hist_len ? lit_hist_len - 1 : 0;
}
DEBUG_PRINTF("lit requires %zu bytes of history\n", lit_hist_len);
DEBUG_PRINTF("lit requires %zu bytes of history\n",
lit_hist_len);
assert(lit_hist_len <= build.cc.grey.maxHistoryAvailable);
auto lit_final = s; // copy
if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) {
DEBUG_PRINTF("truncating to tail of length %zu\n",
size_t{ROSE_SHORT_LITERAL_LEN_MAX});
lit_final.erase(0, lit_final.length()
- ROSE_SHORT_LITERAL_LEN_MAX);
// We shouldn't have set a threshold below 8 chars.
assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX);
assert(!noruns);
}
addLiteralMask(lit, msk, cmp);
const auto &s_final = lit_final.get_string();
bool nocase = lit_final.any_nocase();
DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, "
"cmp=%s\n", f.fragment_id,
escapeString(s_final).c_str(), (int)nocase, noruns,
dumpMask(msk).c_str(), dumpMask(cmp).c_str());
if (!maskIsConsistent(s_final, nocase, msk, cmp)) {
DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
continue;
}
mp.accel_lits.emplace_back(s.get_string(), s.any_nocase(), msk, cmp,
info.group_mask);
mp.history_required = max(mp.history_required, lit_hist_len);
u32 prog_offset = delay_rebuild ? f.delay_program_offset
: f.lit_program_offset;
const auto &groups = f.groups;
mp.lits.emplace_back(move(s_final), nocase, noruns, prog_offset,
groups, msk, cmp);
}
}
}
sort_and_unique(mp.lits);
// Literals used for acceleration must be limited to max_len, as that's all
// we can see in history.
for_each(begin(mp.accel_lits), end(mp.accel_lits),
[&max_len](AccelString &a) {
trim_to_suffix(a.s, max_len);
trim_to_suffix(a.msk, max_len);
trim_to_suffix(a.cmp, max_len);
});
sort_and_unique(mp.accel_lits);
return mp;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -320,23 +320,6 @@ bool ue2_literal::any_nocase() const {
return find(nocase.begin(), nocase.end(), true) != nocase.end();
}
bool mixed_sensitivity(const ue2_literal &s) {
bool cs = false;
bool nc = false;
for (ue2_literal::const_iterator it = s.begin(); it != s.end(); ++it) {
if (!ourisalpha(it->c)) {
continue;
}
if (it->nocase) {
nc = true;
} else {
cs = true;
}
}
return cs && nc;
}
void make_nocase(ue2_literal *lit) {
ue2_literal rv;

View File

@ -35,6 +35,7 @@
#include "ue2common.h"
#include "util/charreach.h"
#include "util/compare.h"
#include "util/hash.h"
#include <iterator>
@ -226,9 +227,36 @@ size_t maxStringSelfOverlap(const ue2_literal &a);
size_t minStringPeriod(const ue2_literal &a);
size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b);
/** \brief True iff the literal cannot be considered entirely case-sensitive
* nor entirely case-insensitive */
bool mixed_sensitivity(const ue2_literal &lit);
/**
 * \brief True iff the alphabetic characters in the range [begin, end) of a
 * literal are neither all case-sensitive nor all case-insensitive.
 *
 * Elements for which ourisalpha() is false do not contribute either way, so
 * an empty range, or one with no alphabetic characters, yields false.
 */
template<class Iter>
bool mixed_sensitivity_in(Iter begin, Iter end) {
    bool seen_caseful = false;
    bool seen_caseless = false;

    for (Iter cur = begin; cur != end; ++cur) {
        if (ourisalpha(cur->c)) {
            // Record which kind of alphabetic character this one is.
            (cur->nocase ? seen_caseless : seen_caseful) = true;
        }
    }

    return seen_caseful && seen_caseless;
}
/**
 * \brief Whole-literal form of mixed_sensitivity_in(): true iff \p s cannot
 * be treated as entirely case-sensitive nor as entirely case-insensitive.
 */
inline
bool mixed_sensitivity(const ue2_literal &s) {
    const auto first = s.begin();
    const auto last = s.end();
    return mixed_sensitivity_in(first, last);
}
void make_nocase(ue2_literal *lit);