2023-05-11 18:54:44 +00:00

410 lines
13 KiB
C++

#include "single_keyword.h"
#include "output.h"
#include "debug.h"

#include <strings.h>

#include <cstring>
#include <map>
using namespace std;
USE_DEBUG_FLAG(D_KEYWORD);
// The 'data' keyword: looks for a fixed byte pattern inside a context buffer.
//
// The pattern and its attributes ("relative", "offset", "depth", "caret", "nocase", "part") are parsed once by
// the constructor. isMatch() then scans the buffer using the 'skip' and 'shift' jump tables that calcTables()
// prepares, and hands every occurrence to the next keyword in the chain via runNext().
class DataKeyword : public SingleKeyword
{
public:
    explicit DataKeyword(const vector<KeywordAttr> &attr, const VariablesMapping &vars);
    MatchStatus isMatch(const I_KeywordRuntimeState* prev) const override;

private:
    // Attribute setters. They all share one signature so that they can be stored in 'setops';
    // the boolean/context ones therefore ignore the variables mapping.
    void setOffset(const KeywordAttr &attr, const VariablesMapping &vars) { offset.setAttr(attr, vars, "data"); }
    void setDepth(const KeywordAttr &attr, const VariablesMapping &vars) { depth.setAttr(attr, vars, "data"); }
    void setCaret(const KeywordAttr &attr, const VariablesMapping &) { is_caret.setAttr(attr, "data"); }
    void setRelative(const KeywordAttr &attr, const VariablesMapping &) { is_relative.setAttr(attr, "data"); }

    void
    setCaseInsensitive(const KeywordAttr &attr, const VariablesMapping &)
    {
        is_case_insensitive.setAttr(attr, "data");
    }

    void setContext(const KeywordAttr &attr, const VariablesMapping &) { ctx.setAttr(attr, "data"); }

    // Decodes the textual pattern (without the surrounding quotes) into 'pattern'.
    void parseString(const string &str);

    // Appends a single byte to the pattern.
    void
    addChar(char ch)
    {
        const auto byte = static_cast<unsigned char>(ch);
        pattern.push_back(byte);
    }

    // Fills 'skip' and 'shift' from 'pattern'.
    void calcTables();

    // Returns {first offset to search from, offset the search must not pass}.
    pair<uint, uint> getStartAndEndOffsets(uint buf_size, const I_KeywordRuntimeState *prev) const;

    // Counts how many trailing pattern bytes match the buffer bytes that end at the given offset.
    uint bytesMatched(const Buffer&, uint) const;

    // Distance to advance after a full match.
    // NOTE(review): advancing by the whole pattern length means occurrences overlapping the one
    // just found are never examined - confirm this is the intended semantics.
    uint
    moveOnMatch() const
    {
        return pattern.size();
    }

    // Distance to advance after a partial match of 'offset_from_end' trailing bytes, where
    // 'first_unmatched_byte' is the buffer byte that broke the match. Takes the larger of the
    // 'shift' table entry and the 'skip' based jump (at least 1).
    uint
    moveOnNoMatch(uint offset_from_end, unsigned char first_unmatched_byte) const
    {
        dbgAssert(shift.size() > offset_from_end) << "Shift table of the 'data' keyword is shorter than the offset";

        const uint byte_skip = skip[first_unmatched_byte];
        const uint skip_size = byte_skip > offset_from_end ? byte_skip - offset_from_end : 1;
        return max(shift[offset_from_end], skip_size);
    }

    // True when the search window is not relative and both 'offset' and 'depth' report themselves constant.
    bool
    isConstant() const
    {
        if (is_relative) return false;
        return offset.isConstant() && depth.isConstant();
    }

    vector<unsigned char> pattern;  // The decoded bytes to search for
    uint skip[256];                 // Per byte value jump table, filled by calcTables()
    vector<uint> shift;             // Per matched-suffix-length jump table, filled by calcTables()

    NumericAttr offset;
    NumericAttr depth;
    BoolAttr is_negative;
    BoolAttr is_caret;
    BoolAttr is_relative;
    BoolAttr is_case_insensitive;
    CtxAttr ctx;

    // Attribute name -> setter dispatch table.
    static const map<string, void(DataKeyword::*)(const KeywordAttr &, const VariablesMapping &)> setops;
};
// Maps each attribute name accepted by the 'data' keyword to the member function that applies it.
// The constructor looks up every attribute that follows the pattern in this table.
const map<string, void(DataKeyword::*)(const KeywordAttr &, const VariablesMapping &)> DataKeyword::setops = {
    { "caret",    &DataKeyword::setCaret           },
    { "depth",    &DataKeyword::setDepth           },
    { "nocase",   &DataKeyword::setCaseInsensitive },
    { "offset",   &DataKeyword::setOffset          },
    { "part",     &DataKeyword::setContext         },
    { "relative", &DataKeyword::setRelative        }
};
// Builds a 'data' keyword from its parsed attributes.
//
// attrs[0] must carry exactly one parameter: the pattern, written as an optional leading '!'
// (marks the keyword as negative) followed by a double-quoted string in the syntax understood by
// parseString(). Every further attribute is applied through the 'setops' dispatch table.
//
// Throws KeywordError when the attribute list or the pattern is empty, when the pattern is not
// properly quoted, when it decodes to zero bytes, or when an attribute name is unknown.
DataKeyword::DataKeyword(const vector<KeywordAttr> &attrs, const VariablesMapping &vars)
        :
    offset(),
    depth()
{
    // Indexing an empty vector is undefined behaviour, so fail cleanly instead.
    if (attrs.empty()) throw KeywordError("No input for the 'data' keyword");

    const auto &pattern_param = attrs[0].getParams();
    if (pattern_param.size() != 1) throw KeywordError("More than one element in the 'data' keyword pattern");

    const string &string_pattern = pattern_param[0];
    if (string_pattern.length() == 0) throw KeywordError("No input for the 'data' keyword");

    uint start = 0;
    if (string_pattern[0] == '!') {
        is_negative.setAttr("data", "negative");
        start++;
    }
    if (string_pattern[start] != '"') throw KeywordError("The data pattern does not begin with '\"'");

    uint end = string_pattern.length()-1;
    if (string_pattern[end] != '"') throw KeywordError("The data pattern does not end with '\"'");
    if (start+1 >= end) throw KeywordError("No input for the 'data' keyword");

    parseString(string_pattern.substr(start+1, end-(start+1)));

    // A quoted string such as "||" is syntactically valid but decodes to no bytes at all.
    // Reject it here, otherwise isMatch() would later run with an empty pattern.
    if (pattern.empty()) throw KeywordError("No input for the 'data' keyword");

    for (uint i = 1; i<attrs.size(); i++) {
        auto curr = setops.find(attrs[i].getAttrName());
        if (curr == setops.end()) {
            throw KeywordError("Unknown attribute '" + attrs[i].getAttrName() + "' in the 'data' keyword");
        }
        auto set_func = curr->second;
        (this->*set_func)(attrs[i], vars);
    }

    // Must run last: the tables depend on the final pattern and on the 'nocase' attribute.
    calcTables();
}
// Prepares the pattern and the two jump tables used by the Boyer-Moore search in isMatch():
//   skip[b]  - distance from the last occurrence of byte 'b' in the pattern to the pattern's end,
//              or the pattern length when 'b' does not occur in it.
//   shift[n] - the smallest shift that is still possible after the last 'n' pattern bytes matched
//              and the byte before them did not.
// When the keyword is case insensitive the pattern is lower-cased first, and every upper-case
// letter gets the same skip entry as its lower-case form.
void
DataKeyword::calcTables()
{
    if (is_case_insensitive) {
        for (auto &ch : pattern) {
            if (isupper(ch)) {
                ch = tolower(ch);
            }
        }
    }

    // Initialize the skip table - when we meet a character that isn't in the pattern, we skip the whole pattern.
    for (auto &ch_skip : skip) {
        ch_skip = pattern.size();
    }

    // Go over the characters in the pattern.
    // We can skip from a character to the end of the pattern.
    // If a character appears more than once, the latest occurrence takes precedence.
    for (uint index = 0; index<pattern.size(); index++) {
        unsigned char ch = pattern[index];
        uint dist_to_end = pattern.size()-(index+1);
        if (is_case_insensitive && islower(ch)) {
            skip[toupper(ch)] = dist_to_end;
        }
        skip[ch] = dist_to_end;
    }

    // Initialize the shift table.
    shift.resize(pattern.size(), 0);
    uint end_offset = pattern.size()-1;

    // Go over all the suffixes (from the empty one to pattern-1).
    for (size_t suffix_len = 0; suffix_len<pattern.size(); suffix_len++) {
        // Find the smallest shift, so when shifting the suffix left:
        // 1. All chars overlapping between the pattern and the shifted suffix match.
        // 2. If the character before the shifted suffix overlaps the pattern, it doesn't match.
        // pattern = "hellololo", suff=2 (must match "[^o]lo"), shift=4 ("hel(lo)lolo")
        // pattern = "olo" suff=2 (must match "[^o]lo"), shift=2 ("(.o)lo")
        // Characters before the pattern are considered wild.
        for (uint shift_offset = 1; shift_offset<=pattern.size(); shift_offset++) {
            // Verify that the suffix matches at the current shift.
            size_t num_of_overlapping_char;
            const unsigned char *suffix_start_ptr;
            const unsigned char *shifted_suffix_start_ptr;
            if (shift_offset+suffix_len <= pattern.size()) {
                // Shifted suffix doesn't exceed the pattern. Compare the whole suffix.
                num_of_overlapping_char = suffix_len;
                suffix_start_ptr = pattern.data() + pattern.size() - suffix_len;
                shifted_suffix_start_ptr = suffix_start_ptr - shift_offset;
            } else {
                // Shifted suffix exceeds the pattern. Compare only the overlapping characters.
                num_of_overlapping_char = pattern.size() - shift_offset;
                suffix_start_ptr = pattern.data() + shift_offset;
                shifted_suffix_start_ptr = pattern.data();
            }

            // memcmp replaces the legacy bcmp; only equality is of interest here.
            if (memcmp(suffix_start_ptr, shifted_suffix_start_ptr, num_of_overlapping_char) != 0) continue;

            // Verify that the character preceding the suffix differs from the one preceding the shifted suffix.
            if (shift_offset+suffix_len < pattern.size()) {
                if (pattern[end_offset-suffix_len] == pattern[end_offset-(shift_offset+suffix_len)]) continue;
            }

            // Set the correct shift offset. shift_offset == pattern.size() always qualifies,
            // so every entry of the table gets a value.
            shift[suffix_len] = shift_offset;
            break;
        }
    }
}
// Decodes the textual form of the pattern (the text between the surrounding quotes) and appends
// the resulting bytes to 'pattern'.
//
// Syntax:
//   - A printable character stands for itself.
//   - A backslash makes the following printable character literal (needed for '|', '\\' and '"').
//   - '|' toggles hex mode. Inside hex mode every two hex digits form one byte; spaces are allowed
//     between bytes, but not between the two nibbles of a byte.
//
// Throws KeywordError on a non-printable character, an unescaped '"', a backslash or another illegal
// character inside a hex section, a hex section closed after an odd number of digits, or a string
// that ends inside a hex section or right after a backslash.
void
DataKeyword::parseString(const string &str)
{
    string hex;
    bool hex_mode = false;
    bool after_bslash = false;

    for (auto ch : str) {
        // The <cctype> classifiers are only defined for values representable as unsigned char (or EOF),
        // so a plain 'char' holding a byte >= 0x80 must be converted before it is classified.
        const auto uch = static_cast<unsigned char>(ch);

        if (after_bslash) {
            if (!isprint(uch)) {
                throw KeywordError(
                    "Illegal backslash character '" +
                    dumpHexChar(ch) +
                    "' in the pattern in the 'data' keyword"
                );
            }
            addChar(ch);
            after_bslash = false;
            continue;
        }

        switch (ch) {
            case '|': {
                if (!hex_mode) {
                    hex = "";
                    hex_mode = true;
                } else {
                    // A pending nibble means the section had an odd number of hex digits.
                    if (hex.size()>0) throw KeywordError("Stoping in the middle of hex string in the 'data' keyword");
                    hex_mode = false;
                }
                break;
            }
            case '\\': {
                if (hex_mode) throw KeywordError("Backslash in hex string in the 'data' keyword");
                after_bslash = true;
                break;
            }
            case '"': {
                throw KeywordError("Unescaped double quotation mark in the 'data' keyword");
            }
            default: {
                if (hex_mode) {
                    if (!isxdigit(uch)) {
                        if (ch != ' ') {
                            throw KeywordError(
                                "Illegal character '" +
                                dumpHexChar(ch) +
                                "' in the hex string in the 'data' keyword"
                            );
                        }
                        if (hex.size()>0) {
                            throw KeywordError("Space separating nibbles in the hex string in the 'data' keyword");
                        }
                        break;
                    }
                    hex += ch;
                    if (hex.size()>=2) {
                        // Two nibbles collected - emit the byte they describe.
                        addChar(static_cast<char>(stol(hex, nullptr, 16)));
                        hex = "";
                    }
                } else {
                    if (!isprint(uch)) {
                        throw KeywordError(
                            "Illegal character '" +
                            dumpHexChar(ch) +
                            "' in the pattern in the 'data' keyword"
                        );
                    }
                    addChar(ch);
                }
            }
        }
    }

    if ( hex_mode || after_bslash ) {
        throw KeywordError("The 'data' keyword's pattern has ended in the middle of the parsing");
    }
}
// Adds a signed delta to an unsigned offset.
// A negative delta that would take the offset below zero yields 0. A positive delta is added with
// the usual unsigned (wrapping) arithmetic.
static uint
addOffset(uint offset, int add)
{
    if (add < 0) {
        // Compute the magnitude in unsigned arithmetic: negating the int directly overflows
        // (undefined behaviour) when 'add' is INT_MIN.
        const uint magnitude = 0u - static_cast<uint>(add);
        if (offset < magnitude) return 0;
        return offset - magnitude;
    }
    return offset + static_cast<uint>(add);
}
// Computes the window of the buffer in which the pattern may be searched.
//
// Returns {start, end}:
//   start - the 'offset' attribute applied to the offset reported by 'prev' for this context when
//           the keyword is relative, or to 0 otherwise.
//   end   - 'buf_size', lowered to start + 'depth' when a depth is set, and to start + pattern
//           length when the keyword has the 'caret' attribute.
pair<uint, uint>
DataKeyword::getStartAndEndOffsets(uint buf_size, const I_KeywordRuntimeState *prev) const
{
    uint base_offset = 0;
    if (is_relative) base_offset = prev->getOffset(ctx);

    const uint window_start = addOffset(base_offset, offset.evalAttr(prev));
    uint window_end = buf_size;

    if (depth.isSet()) {
        const uint depth_limit = addOffset(window_start, depth.evalAttr(prev));
        window_end = std::min(window_end, depth_limit);
    }

    if (is_caret) {
        // With 'caret' the pattern may only appear right at the start of the window.
        const uint caret_limit = window_start + static_cast<uint>(pattern.size());
        window_end = std::min(window_end, caret_limit);
    }

    return make_pair(window_start, window_end);
}
// Compares the pattern, from its last byte backwards, against the buffer bytes that end just
// before 'offset'. Returns how many trailing bytes matched; the pattern length means a full match.
// In case-insensitive mode the buffer bytes are lower-cased before the comparison.
uint
DataKeyword::bytesMatched(const Buffer &buf, uint offset) const
{
    const uint pattern_len = pattern.size();

    if (is_case_insensitive) {
        for (uint pat_end = pattern_len, buf_end = offset; pat_end > 0; pat_end--, buf_end--) {
            if (pattern[pat_end - 1] != tolower(buf[buf_end - 1])) return pattern_len - pat_end;
        }
        return pattern_len;
    }

    for (uint pat_end = pattern_len, buf_end = offset; pat_end > 0; pat_end--, buf_end--) {
        if (pattern[pat_end - 1] != buf[buf_end - 1]) return pattern_len - pat_end;
    }
    return pattern_len;
}
// Searches the context buffer for the pattern and lets the rest of the keyword chain run on
// every occurrence found.
//
// Result:
//   - Buffer unavailable: a negative keyword continues with runNext(prev); otherwise NoMatchFinal.
//   - Negative keyword and the pattern is found: NoMatchFinal if isConstant(), else NoMatch.
//   - Positive keyword: the first result of runNext() on an occurrence that is not NoMatch.
//   - Pattern never found: a negative keyword continues with runNext(prev); a positive one gives
//     NoMatchFinal if isConstant(), else NoMatch.
//   - Pattern found but every continuation returned NoMatch: NoMatch.
MatchStatus
DataKeyword::isMatch(const I_KeywordRuntimeState *prev) const
{
    dbgAssert(pattern.size()>0) << "Trying to run on an uninitialized keyword data";
    dbgDebug(D_KEYWORD) << "Searching for " << dumpHex(pattern);

    auto part = Singleton::Consume<I_Environment>::by<KeywordComp>()->get<Buffer>(static_cast<string>(ctx));
    if (!part.ok()) {
        // Nothing to search in - which is exactly what a negative keyword wants.
        if (is_negative) return runNext(prev);
        return MatchStatus::NoMatchFinal;
    }

    const auto &buf = part.unpack();
    dbgTrace(D_KEYWORD) << "Full buffer: " << dumpHex(buf);

    const pair<uint, uint> window = getStartAndEndOffsets(buf.size(), prev);
    const uint window_end = window.second;

    // 'candidate_end' is the offset just past the last byte of the occurrence being examined.
    uint candidate_end = window.first;
    candidate_end += pattern.size();

    bool match_found = false;
    while (candidate_end <= window_end) {
        // Short circuit for the common, simple case where the last byte doesn't match.
        const uint last_byte_skip = skip[buf[candidate_end - 1]];
        if (last_byte_skip) {
            candidate_end += last_byte_skip;
            continue;
        }

        // Full Boyer-Moore comparison.
        const uint match_size = bytesMatched(buf, candidate_end);
        if (match_size != pattern.size()) {
            candidate_end += moveOnNoMatch(match_size, buf[candidate_end - (match_size + 1)]);
            continue;
        }

        // The pattern is present, so a negative keyword has failed.
        if (is_negative) {
            return isConstant() ? MatchStatus::NoMatchFinal : MatchStatus::NoMatch;
        }

        match_found = true;
        OffsetRuntimeState new_offset(prev, ctx, candidate_end);
        auto next_keyword_result = runNext(&new_offset);
        if (next_keyword_result != MatchStatus::NoMatch) return next_keyword_result;

        // The rest of the chain rejected this occurrence - look for another one.
        candidate_end += moveOnMatch();
    }

    if (!match_found) {
        // No matches is a success for negative keywords.
        if (is_negative) return runNext(prev);
        // If there were no matches and the keyword is not affected by other keywords (constant
        // search window), then we know that the rule won't match.
        if (isConstant()) return MatchStatus::NoMatchFinal;
    }
    return MatchStatus::NoMatch;
}
// Factory for the 'data' keyword: constructs a DataKeyword from the given attributes and
// variables mapping and returns it through the generic SingleKeyword interface.
// Any exception thrown by the DataKeyword constructor propagates to the caller.
unique_ptr<SingleKeyword>
genDataKeyword(const vector<KeywordAttr> &attr, VariablesMapping &known_vars)
{
    unique_ptr<SingleKeyword> keyword = make_unique<DataKeyword>(attr, known_vars);
    return keyword;
}