vectorscan/src/rose/rose_build_dump.cpp
2017-04-26 14:41:29 +10:00

575 lines
16 KiB
C++

/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "rose_build_dump.h"
#include "rose_build_impl.h"
#include "rose_build_matchers.h"
#include "rose/rose_dump.h"
#include "rose_internal.h"
#include "ue2common.h"
#include "hwlm/hwlm_literal.h"
#include "nfa/castlecompile.h"
#include "nfa/nfa_internal.h"
#include "nfagraph/ng_dump.h"
#include "som/slot_manager_dump.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
#include "util/ue2string.h"
#include <iomanip>
#include <ostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#ifndef DUMP_SUPPORT
#error No dump support!
#endif
using namespace std;
namespace ue2 {
/**
 * \brief Return a printable kind for a left_id or a suffix_id.
 *
 * The accessors graph(), dfa(), haig() and castle() are probed in that order;
 * the first one that yields a non-null result has its \c kind field rendered
 * with to_string().
 *
 * \param g Identifier object exposing the four accessors above.
 * \return The rendered kind, or "UNKNOWN" if every accessor returned null.
 */
template<class Graph>
string render_kind(const Graph &g) {
    if (auto holder = g.graph()) {
        return to_string(holder->kind);
    }

    if (auto raw_dfa = g.dfa()) {
        return to_string(raw_dfa->kind);
    }

    if (auto raw_haig = g.haig()) {
        return to_string(raw_haig->kind);
    }

    if (auto proto = g.castle()) {
        return to_string(proto->kind);
    }

    return "UNKNOWN";
}
namespace {
class RoseGraphWriter {
public:
RoseGraphWriter(const RoseBuildImpl &b_in, const RoseEngine *t_in) :
build(b_in), t(t_in) {
for (const auto &m : build.ghost) {
ghost.insert(m.second);
}
}
void operator() (ostream &os, const RoseVertex &v) const {
const RoseGraph &g = build.g;
if (v == build.root) {
os << "[label=\"<root>\"]";
return;
}
if (v == build.anchored_root) {
os << "[label=\"<^>\"]";
return;
}
os << "[label=\"";
os << "index=" << g[v].index <<"\\n";
for (u32 lit_id : g[v].literals) {
writeLiteral(os, lit_id);
os << "\\n";
}
os << "min_offset=" << g[v].min_offset;
if (g[v].max_offset >= ROSE_BOUND_INF) {
os << ", max_offset=inf";
} else {
os << ", max_offset=" << g[v].max_offset;
}
os << "\\n";
if (!g[v].reports.empty()) {
if (g[v].eod_accept) {
os << "\\nACCEPT_EOD";
} else {
os << "\\nACCEPT";
}
os << " (rep=" << as_string_list(g[v].reports) << ")";
}
if (g[v].suffix) {
suffix_id suff(g[v].suffix);
os << "\\n" << render_kind(suff) << " (top " << g[v].suffix.top;
auto it = build.suffix_queue_map.find(suff);
if (it != end(build.suffix_queue_map)) {
os << ", queue " << it->second;
}
os << ")";
}
if (ghost.find(v) != ghost.end()) {
os << "\\nGHOST";
}
if (g[v].left) {
left_id left(g[v].left);
os << "\\n" << render_kind(left) << " (queue ";
auto it = build.leftfix_queue_map.find(left);
if (it != end(build.leftfix_queue_map)) {
os << it->second;
} else {
os << "??";
}
os << ", report " << g[v].left.leftfix_report << ")";
}
os << "\"";
// Roles with a rose prefix get a colour.
if (g[v].left) {
os << " color=violetred ";
}
// Our accepts get different colours.
if (!g[v].reports.empty()) {
os << " color=blue ";
}
if (g[v].suffix) {
os << " color=forestgreen ";
}
os << "]";
}
void operator() (ostream &os, const RoseEdge &e) const {
const RoseGraph &g = build.g;
// Render the bounds on this edge.
u32 minBound = g[e].minBound;
u32 maxBound = g[e].maxBound;
os << "[label=\"";
if (minBound == 0 && maxBound == ROSE_BOUND_INF) {
os << ".*";
} else if (minBound == 1 && maxBound == ROSE_BOUND_INF) {
os << ".+";
} else {
os << ".{" << minBound << ",";
if (maxBound != ROSE_BOUND_INF) {
os << maxBound;
}
os << "}";
}
// If we lead to an infix, display which top we're using.
RoseVertex v = target(e, g);
if (g[v].left) {
os << "\\nROSE TOP " << g[e].rose_top;
}
switch (g[e].history) {
case ROSE_ROLE_HISTORY_NONE:
break;
case ROSE_ROLE_HISTORY_ANCH:
os << "\\nANCH history";
break;
case ROSE_ROLE_HISTORY_LAST_BYTE:
os << "\\nLAST_BYTE history";
break;
case ROSE_ROLE_HISTORY_INVALID:
os << "\\nINVALID history";
break;
}
os << "\"]";
}
private:
// Render the literal associated with a vertex.
void writeLiteral(ostream &os, u32 id) const {
os << "lit=" << id;
if (id < build.literal_info.size()) {
os << "/" << build.literal_info[id].final_id << " ";
} else {
os << "/nofinal ";
}
if (contains(build.literals.right, id)) {
const auto &lit = build.literals.right.at(id);
os << '\'' << dotEscapeString(lit.s.get_string()) << '\'';
if (lit.s.any_nocase()) {
os << " (nocase)";
}
if (lit.delay) {
os << " +" << lit.delay;
}
} else {
os << "<unknown>";
}
}
set<RoseVertex> ghost;
const RoseBuildImpl &build;
const RoseEngine *t;
};
} // namespace
void dumpRoseGraph(const RoseBuild &build_base, const RoseEngine *t,
const char *filename) {
const RoseBuildImpl &build = dynamic_cast<const RoseBuildImpl &>(build_base);
const Grey &grey = build.cc.grey;
/* "early" rose graphs should only be dumped if we are dumping intermediate
* graphs. Early graphs can be identified by the lack of a RoseEngine. */
u32 flag_test = t ? Grey::DUMP_IMPL : Grey::DUMP_INT_GRAPH;
if (!(grey.dumpFlags & flag_test)) {
return;
}
stringstream ss;
ss << grey.dumpPath << filename;
DEBUG_PRINTF("dumping graph to %s\n", ss.str().c_str());
ofstream os(ss.str());
RoseGraphWriter writer(build, t);
writeGraphviz(os, build.g, writer, get(boost::vertex_index, build.g));
}
namespace {
struct CompareVertexRole {
explicit CompareVertexRole(const RoseGraph &g_in) : g(g_in) {}
inline bool operator()(const RoseVertex &a, const RoseVertex &b) const {
return g[a].index < g[b].index;
}
private:
const RoseGraph &g;
};
}
static
void lit_graph_info(const RoseBuildImpl &build, const rose_literal_info &li,
u32 *min_offset, bool *in_root_role) {
*min_offset = ~0U;
*in_root_role = false;
for (auto v : li.vertices) {
*in_root_role |= build.isRootSuccessor(v);
LIMIT_TO_AT_MOST(min_offset, build.g[v].min_offset);
}
}
static
void dumpRoseLiterals(const RoseBuildImpl &build, const char *filename) {
const RoseGraph &g = build.g;
DEBUG_PRINTF("dumping literals\n");
ofstream os(filename);
os << "ROSE LITERALS: a total of " << build.literals.right.size()
<< " literals and " << num_vertices(g) << " roles." << endl << endl;
for (const auto &e : build.literals.right) {
u32 id = e.first;
const ue2_literal &s = e.second.s;
const rose_literal_info &lit_info = build.literal_info[id];
switch (e.second.table) {
case ROSE_ANCHORED:
os << "ANCHORED";
break;
case ROSE_FLOATING:
os << "FLOATING";
break;
case ROSE_EOD_ANCHORED:
os << "EOD-ANCHORED";
break;
case ROSE_ANCHORED_SMALL_BLOCK:
os << "SMALL-BLOCK";
break;
case ROSE_EVENT:
os << "EVENT";
break;
}
os << " ID " << id << "/" << lit_info.final_id << ": \""
<< escapeString(s.get_string()) << "\""
<< " (len " << s.length() << ",";
if (s.any_nocase()) {
os << " nocase,";
}
if (lit_info.requires_benefits) {
os << " benefits,";
}
if (e.second.delay) {
os << " delayed "<< e.second.delay << ",";
}
os << " groups 0x" << hex << setw(16) << setfill('0')
<< lit_info.group_mask << dec << ",";
if (lit_info.squash_group) {
os << " squashes group,";
}
u32 min_offset;
bool in_root_role;
lit_graph_info(build, lit_info, &min_offset, &in_root_role);
os << " min offset " << min_offset;
if (in_root_role) {
os << " root literal";
}
os << ") roles=" << lit_info.vertices.size() << endl;
if (!lit_info.delayed_ids.empty()) {
os << " Children:";
for (u32 d_id : lit_info.delayed_ids) {
os << " " << d_id;
}
os << endl;
}
// Temporary vector, so that we can sort the output by role.
vector<RoseVertex> verts(lit_info.vertices.begin(),
lit_info.vertices.end());
sort(verts.begin(), verts.end(), CompareVertexRole(g));
for (RoseVertex v : verts) {
// role info
os << " Index " << g[v].index << ": groups=0x" << hex << setw(16)
<< setfill('0') << g[v].groups << dec;
if (g[v].reports.empty()) {
os << ", report=NONE";
} else {
os << ", report={" << as_string_list(g[v].reports) << "}";
}
os << ", min_offset=" << g[v].min_offset;
os << ", max_offset=" << g[v].max_offset << endl;
// pred info
for (const auto &ie : in_edges_range(v, g)) {
const auto &u = source(ie, g);
os << " Predecessor index=";
if (u == build.root) {
os << "ROOT";
} else if (u == build.anchored_root) {
os << "ANCHORED_ROOT";
} else {
os << g[u].index;
}
os << ": bounds [" << g[ie].minBound << ", ";
if (g[ie].maxBound == ROSE_BOUND_INF) {
os << "inf";
} else {
os << g[ie].maxBound;
}
os << "]" << endl;
}
}
}
os.close();
}
template<class Iter>
static
string toHex(Iter i, const Iter &end) {
ostringstream oss;
for (; i != end; ++i) {
u8 c = *i;
oss << hex << setw(2) << setfill('0') << ((unsigned)c & 0xff);
}
return oss.str();
}
/**
 * \brief Report whether \a c is one of the characters
 * <tt># $ ( ) * + . / ? [ \ ] ^ { | }</tt>.
 *
 * \return true for exactly those sixteen characters, false otherwise.
 */
static
bool isMetaChar(char c) {
    static const char metas[] = {
        '#', '$', '(', ')', '*', '+', '.', '/',
        '?', '[', '\\', ']', '^', '{', '|', '}',
    };

    for (const char meta : metas) {
        if (meta == c) {
            return true;
        }
    }
    return false;
}
/**
 * \brief Render a literal string as regex text.
 *
 * Characters in the range 0x20..0x7e are copied through, preceded by a
 * backslash when isMetaChar() reports them as metacharacters. Newline,
 * carriage return and tab become \\n, \\r and \\t; every other character
 * becomes a two-digit \\xNN hex escape.
 *
 * \param lit Raw literal characters.
 * \return The escaped pattern text.
 */
static
string toRegex(const string &lit) {
    ostringstream out;

    for (const char ch : lit) {
        const bool printable = (ch >= 0x20) && (ch <= 0x7e);
        if (printable) {
            if (isMetaChar(ch)) {
                out << "\\";
            }
            out << ch;
            continue;
        }

        switch (ch) {
        case '\n':
            out << "\\n";
            break;
        case '\r':
            out << "\\r";
            break;
        case '\t':
            out << "\\t";
            break;
        default:
            // Mask to the low byte so a negative char still prints two digits.
            out << "\\x" << hex << setw(2) << setfill('0')
                << static_cast<unsigned>(ch & 0xff) << dec;
            break;
        }
    }

    return out.str();
}
static
void dumpTestLiterals(const string &filename, const vector<hwlmLiteral> &lits) {
ofstream of(filename.c_str());
// Unique regex index, as literals may share an ID.
u32 i = 0;
for (const hwlmLiteral &lit : lits) {
// First, detail in a comment.
of << "# id=" << lit.id;
if (!lit.msk.empty()) {
of << " msk=0x" << toHex(lit.msk.begin(), lit.msk.end());
of << " cmp=0x" << toHex(lit.cmp.begin(), lit.cmp.end());
}
of << " groups=0x" << hex << setfill('0') << lit.groups << dec;
if (lit.noruns) {
of << " noruns";
}
of << endl;
// Second, literal rendered as a regex.
of << i << ":/" << toRegex(lit.s) << (lit.nocase ? "/i" : "/");
of << endl;
i++;
}
of.close();
}
static
void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) {
size_t historyRequired = build.calcHistoryRequired();
size_t longLitLengthThreshold =
calcLongLitThreshold(build, historyRequired);
const auto final_to_frag_map = groupByFragment(build);
auto mp = makeMatcherProto(build, final_to_frag_map, ROSE_ANCHORED,
longLitLengthThreshold);
dumpTestLiterals(base + "rose_anchored_test_literals.txt", mp.lits);
mp = makeMatcherProto(build, final_to_frag_map, ROSE_FLOATING,
longLitLengthThreshold);
dumpTestLiterals(base + "rose_float_test_literals.txt", mp.lits);
mp = makeMatcherProto(build, final_to_frag_map, ROSE_EOD_ANCHORED,
build.ematcher_region_size);
dumpTestLiterals(base + "rose_eod_test_literals.txt", mp.lits);
if (!build.cc.streaming) {
mp = makeMatcherProto(build, final_to_frag_map, ROSE_FLOATING,
ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN);
auto mp2 = makeMatcherProto(build, final_to_frag_map,
ROSE_ANCHORED_SMALL_BLOCK,
ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN);
mp.lits.insert(end(mp.lits), begin(mp2.lits), end(mp2.lits));
dumpTestLiterals(base + "rose_smallblock_test_literals.txt", mp.lits);
}
}
void dumpRose(const RoseBuild &build_base, const RoseEngine *t,
const Grey &grey) {
if (!grey.dumpFlags) {
return;
}
const RoseBuildImpl &build = dynamic_cast<const RoseBuildImpl&>(build_base);
stringstream ss;
ss << grey.dumpPath << "rose.txt";
FILE *f = fopen(ss.str().c_str(), "w");
if (!t) {
fprintf(f, "<< no rose >>\n");
fclose(f);
return;
}
// Dump Rose table info
roseDumpText(t, f);
fclose(f);
roseDumpComponents(t, false, grey.dumpPath);
// Graph.
dumpRoseGraph(build, t, "rose.dot");
// Literals.
ss.str("");
ss.clear();
ss << grey.dumpPath << "rose_literals.txt";
dumpRoseLiterals(build, ss.str().c_str());
dumpRoseTestLiterals(build, grey.dumpPath);
f = fopen((grey.dumpPath + "/rose_struct.txt").c_str(), "w");
roseDumpStructRaw(t, f);
fclose(f);
}
} // namespace ue2