mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-11-15 17:02:14 +03:00
add new Literal API for pure literal expressions:
Add the compile-time APIs hs_compile_lit() and hs_compile_lit_multi() to handle pure literal pattern sets. A corresponding --literal-on option is added to the Hyperscan testing suites. Extended parameters and some flags are not supported by this API.
This commit is contained in:
committed by
Chang, Harry
parent
8bfbf07f75
commit
23e5f06594
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2018, Intel Corporation
|
||||
* Copyright (c) 2016-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -41,6 +41,7 @@ extern unsigned int somPrecisionMode;
|
||||
extern bool forceEditDistance;
|
||||
extern unsigned editDistance;
|
||||
extern bool printCompressSize;
|
||||
extern bool useLiteralApi;
|
||||
|
||||
/** Structure for the result of a single complete scan. */
|
||||
struct ResultEntry {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2018, Intel Corporation
|
||||
* Copyright (c) 2016-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -411,22 +411,30 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode,
|
||||
ext_ptr[i] = &ext[i];
|
||||
}
|
||||
|
||||
Timer timer;
|
||||
timer.start();
|
||||
|
||||
hs_compile_error_t *compile_err;
|
||||
Timer timer;
|
||||
|
||||
#ifndef RELEASE_BUILD
|
||||
err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(),
|
||||
ext_ptr.data(), count, full_mode, nullptr,
|
||||
&db, &compile_err, grey);
|
||||
#else
|
||||
err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.data(),
|
||||
ext_ptr.data(), count, full_mode, nullptr,
|
||||
&db, &compile_err);
|
||||
#endif
|
||||
if (useLiteralApi) {
|
||||
// Pattern length computation should be done before timer start.
|
||||
vector<size_t> lens(count);
|
||||
for (unsigned int i = 0; i < count; i++) {
|
||||
lens[i] = strlen(patterns[i]);
|
||||
}
|
||||
timer.start();
|
||||
err = hs_compile_lit_multi_int(patterns.data(), flags.data(),
|
||||
ids.data(), ext_ptr.data(),
|
||||
lens.data(), count, full_mode,
|
||||
nullptr, &db, &compile_err, grey);
|
||||
timer.complete();
|
||||
} else {
|
||||
timer.start();
|
||||
err = hs_compile_multi_int(patterns.data(), flags.data(),
|
||||
ids.data(), ext_ptr.data(), count,
|
||||
full_mode, nullptr, &db, &compile_err,
|
||||
grey);
|
||||
timer.complete();
|
||||
}
|
||||
|
||||
timer.complete();
|
||||
compileSecs = timer.seconds();
|
||||
peakMemorySize = getPeakHeap();
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2018, Intel Corporation
|
||||
* Copyright (c) 2016-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -87,6 +87,7 @@ unsigned int somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
|
||||
bool forceEditDistance = false;
|
||||
unsigned editDistance = 0;
|
||||
bool printCompressSize = false;
|
||||
bool useLiteralApi = false;
|
||||
|
||||
// Globals local to this file.
|
||||
static bool compressStream = false;
|
||||
@@ -218,6 +219,7 @@ void usage(const char *error) {
|
||||
printf(" --per-scan Display per-scan Mbit/sec results.\n");
|
||||
printf(" --echo-matches Display all matches that occur during scan.\n");
|
||||
printf(" --sql-out FILE Output sqlite db.\n");
|
||||
printf(" --literal-on Use Hyperscan pure literal matching.\n");
|
||||
printf(" -S NAME Signature set name (for sqlite db).\n");
|
||||
printf("\n\n");
|
||||
|
||||
@@ -250,6 +252,7 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
|
||||
int do_echo_matches = 0;
|
||||
int do_sql_output = 0;
|
||||
int option_index = 0;
|
||||
int literalFlag = 0;
|
||||
vector<string> sigFiles;
|
||||
|
||||
static struct option longopts[] = {
|
||||
@@ -257,6 +260,7 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
|
||||
{"echo-matches", no_argument, &do_echo_matches, 1},
|
||||
{"compress-stream", no_argument, &do_compress, 1},
|
||||
{"sql-out", required_argument, &do_sql_output, 1},
|
||||
{"literal-on", no_argument, &literalFlag, 1},
|
||||
{nullptr, 0, nullptr, 0}
|
||||
};
|
||||
|
||||
@@ -463,6 +467,8 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
|
||||
loadSignatureList(file, sigs);
|
||||
sigSets.emplace_back(file, move(sigs));
|
||||
}
|
||||
|
||||
useLiteralApi = (bool)literalFlag;
|
||||
}
|
||||
|
||||
/** Start the global timer. */
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -92,6 +92,7 @@ bool g_allSignatures = false;
|
||||
bool g_forceEditDistance = false;
|
||||
bool build_sigs = false;
|
||||
bool check_logical = false;
|
||||
bool use_literal_api = false;
|
||||
unsigned int g_signature;
|
||||
unsigned int g_editDistance;
|
||||
unsigned int globalFlags = 0;
|
||||
@@ -322,11 +323,26 @@ void checkExpression(UNUSED void *threadarg) {
|
||||
#if !defined(RELEASE_BUILD)
|
||||
// This variant is available in non-release builds and allows us to
|
||||
// modify greybox settings.
|
||||
err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, mode,
|
||||
nullptr, &db, &compile_err, *g_grey);
|
||||
if (use_literal_api) {
|
||||
size_t len = strlen(regexp);
|
||||
err = hs_compile_lit_multi_int(&regexp, &flags, nullptr, &extp,
|
||||
&len, 1, mode, nullptr, &db,
|
||||
&compile_err, *g_grey);
|
||||
} else {
|
||||
err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1,
|
||||
mode, nullptr, &db, &compile_err,
|
||||
*g_grey);
|
||||
}
|
||||
#else
|
||||
err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, mode,
|
||||
nullptr, &db, &compile_err);
|
||||
if (use_literal_api) {
|
||||
size_t len = strlen(regexp);
|
||||
err = hs_compile_lit_multi_int(&regexp, &flags, nullptr, &extp,
|
||||
&len, 1, mode, nullptr, &db,
|
||||
&compile_err, *g_grey);
|
||||
} else {
|
||||
err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1,
|
||||
mode, nullptr, &db, &compile_err);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (err == HS_SUCCESS) {
|
||||
@@ -381,6 +397,11 @@ void checkLogicalExpression(UNUSED void *threadarg) {
|
||||
|
||||
ExprExtMap::const_iterator it;
|
||||
while (getNextLogicalExpression(it)) {
|
||||
if (use_literal_api) {
|
||||
recordSuccess(g_exprMap, it->first);
|
||||
continue;
|
||||
}
|
||||
|
||||
const ParsedExpr &comb = it->second;
|
||||
|
||||
vector<unsigned> subIds;
|
||||
@@ -470,6 +491,7 @@ void usage() {
|
||||
<< " -h Display this help." << endl
|
||||
<< " -B Build signature set." << endl
|
||||
<< " -C Check logical combinations (default: off)." << endl
|
||||
<< " --literal-on Processing pure literals, no need to check." << endl
|
||||
<< endl;
|
||||
}
|
||||
|
||||
@@ -477,9 +499,15 @@ static
|
||||
void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) {
|
||||
const char options[] = "e:E:s:z:hHLNV8G:T:BC";
|
||||
bool signatureSet = false;
|
||||
int literalFlag = 0;
|
||||
|
||||
static struct option longopts[] = {
|
||||
{"literal-on", no_argument, &literalFlag, 1},
|
||||
{nullptr, 0, nullptr, 0}
|
||||
};
|
||||
|
||||
for (;;) {
|
||||
int c = getopt_long(argc, argv, options, nullptr, nullptr);
|
||||
int c = getopt_long(argc, argv, options, longopts, nullptr);
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
@@ -539,6 +567,9 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) {
|
||||
case 'C':
|
||||
check_logical = true;
|
||||
break;
|
||||
case 0:
|
||||
case 1:
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
exit(1);
|
||||
@@ -564,6 +595,8 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) {
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
use_literal_api = (bool)literalFlag;
|
||||
}
|
||||
|
||||
static
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#include "parser/Parser.h"
|
||||
#include "parser/parse_error.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/string_util.h"
|
||||
#include "util/unicode_def.h"
|
||||
#include "util/unordered.h"
|
||||
|
||||
@@ -111,6 +112,15 @@ bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
|
||||
return false;
|
||||
}
|
||||
|
||||
if (use_literal_api) {
|
||||
// filter out flags not supported by pure literal API.
|
||||
u32 not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 |
|
||||
HS_FLAG_UCP | HS_FLAG_PREFILTER;
|
||||
hs_flags &= ~not_supported;
|
||||
force_utf8 = false;
|
||||
force_prefilter = false;
|
||||
}
|
||||
|
||||
expr.swap(regex);
|
||||
|
||||
if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som,
|
||||
@@ -260,9 +270,29 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
|
||||
throw PcreCompileFailure("Unable to decode flags.");
|
||||
}
|
||||
|
||||
// When the Hyperscan literal API is on, convert the regex string to hex.
|
||||
if (use_literal_api && !combination) {
|
||||
unsigned char *pat
|
||||
= reinterpret_cast<unsigned char *>(const_cast<char *>(re.c_str()));
|
||||
char *str = makeHex(pat, re.length());
|
||||
if (!str) {
|
||||
throw PcreCompileFailure("makeHex() malloc failure.");
|
||||
}
|
||||
re.assign(str);
|
||||
free(str);
|
||||
}
|
||||
|
||||
// filter out flags not supported by PCRE
|
||||
u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
|
||||
HS_EXT_FLAG_MIN_LENGTH;
|
||||
if (use_literal_api) {
|
||||
ext.flags &= 0ULL;
|
||||
ext.min_offset = 0;
|
||||
ext.max_offset = MAX_OFFSET;
|
||||
ext.min_length = 0;
|
||||
ext.edit_distance = 0;
|
||||
ext.hamming_distance = 0;
|
||||
}
|
||||
if (ext.flags & ~supported) {
|
||||
// edit distance is a known unsupported flag, so just throw a soft error
|
||||
if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
|
||||
@@ -314,7 +344,6 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
|
||||
return compiled;
|
||||
}
|
||||
|
||||
|
||||
compiled->bytecode =
|
||||
pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "ng_corpus_generator.h"
|
||||
#include "NfaGeneratedCorpora.h"
|
||||
#include "ExpressionParser.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "grey.h"
|
||||
#include "hs_compile.h"
|
||||
@@ -44,6 +45,7 @@
|
||||
#include "util/compile_context.h"
|
||||
#include "util/compile_error.h"
|
||||
#include "util/report_manager.h"
|
||||
#include "util/string_util.h"
|
||||
#include "util/target_info.h"
|
||||
|
||||
#include <string>
|
||||
@@ -80,6 +82,18 @@ void NfaGeneratedCorpora::generate(unsigned id, vector<Corpus> &data) {
|
||||
throw CorpusFailure("Expression could not be read: " + i->second);
|
||||
}
|
||||
|
||||
// When the Hyperscan literal API is on, convert the regex string to hex.
|
||||
if (use_literal_api && !(hs_flags & HS_FLAG_COMBINATION)) {
|
||||
unsigned char *pat
|
||||
= reinterpret_cast<unsigned char *>(const_cast<char *>(re.c_str()));
|
||||
char *str = makeHex(pat, re.length());
|
||||
if (!str) {
|
||||
throw CorpusFailure("makeHex() malloc failure.");
|
||||
}
|
||||
re.assign(str);
|
||||
free(str);
|
||||
}
|
||||
|
||||
// A combination's corpus consists of its sub-expressions' corpora.
|
||||
if (hs_flags & HS_FLAG_COMBINATION) {
|
||||
ParsedLogical pl;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -925,11 +925,22 @@ compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
|
||||
const unsigned count = patterns.size();
|
||||
hs_database_t *db = nullptr;
|
||||
hs_compile_error_t *compile_err;
|
||||
hs_error_t err;
|
||||
|
||||
hs_error_t err = hs_compile_multi_int(&patterns[0], &flags[0],
|
||||
&idsvec[0], ext.c_array(), count,
|
||||
mode, platform, &db,
|
||||
&compile_err, grey);
|
||||
if (use_literal_api) {
|
||||
// Compute length of each pattern.
|
||||
vector<size_t> lens(count);
|
||||
for (unsigned int i = 0; i < count; i++) {
|
||||
lens[i] = strlen(patterns[i]);
|
||||
}
|
||||
err = hs_compile_lit_multi_int(&patterns[0], &flags[0], &idsvec[0],
|
||||
ext.c_array(), &lens[0], count, mode,
|
||||
platform, &db, &compile_err, grey);
|
||||
} else {
|
||||
err = hs_compile_multi_int(&patterns[0], &flags[0], &idsvec[0],
|
||||
ext.c_array(), count, mode, platform, &db,
|
||||
&compile_err, grey);
|
||||
}
|
||||
|
||||
if (err != HS_SUCCESS) {
|
||||
error = compile_err->message;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -116,6 +116,7 @@ void usage(const char *name, const char *error) {
|
||||
printf(" --abort-on-fail Abort, rather than exit, on failure.\n");
|
||||
printf(" --no-signal-handler Do not handle handle signals (to generate "
|
||||
"backtraces).\n");
|
||||
printf(" --literal-on Use Hyperscan pure literal matching.\n");
|
||||
printf("\n");
|
||||
printf("Memory and resource control options:\n");
|
||||
printf("\n");
|
||||
@@ -174,6 +175,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
|
||||
int mangleScratch = 0;
|
||||
int compressFlag = 0;
|
||||
int compressResetFlag = 0;
|
||||
int literalFlag = 0;
|
||||
static const struct option longopts[] = {
|
||||
{"copy-scratch", 0, &copyScratch, 1},
|
||||
{"copy-stream", 0, &copyStream, 1},
|
||||
@@ -187,6 +189,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
|
||||
{"compress-expand", 0, &compressFlag, 1},
|
||||
{"compress-reset-expand", 0, &compressResetFlag, 1},
|
||||
{"no-groups", 0, &no_groups, 1},
|
||||
{"literal-on", 0, &literalFlag, 1},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
for (;;) {
|
||||
@@ -589,4 +592,5 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
|
||||
use_mangle_scratch = (bool) mangleScratch;
|
||||
use_compress_expand = (bool)compressFlag;
|
||||
use_compress_reset_expand = (bool)compressResetFlag;
|
||||
use_literal_api = (bool)literalFlag;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -82,6 +82,7 @@ extern bool use_copy_stream;
|
||||
extern bool use_mangle_scratch;
|
||||
extern bool use_compress_expand;
|
||||
extern bool use_compress_reset_expand;
|
||||
extern bool use_literal_api;
|
||||
extern int abort_on_failure;
|
||||
extern int no_signal_handler;
|
||||
extern bool force_edit_distance;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -118,6 +118,7 @@ bool use_copy_stream = false;
|
||||
bool use_mangle_scratch = false;
|
||||
bool use_compress_expand = false;
|
||||
bool use_compress_reset_expand = false;
|
||||
bool use_literal_api = false;
|
||||
int abort_on_failure = 0;
|
||||
int no_signal_handler = 0;
|
||||
size_t max_scan_queue_len = 25000;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -106,6 +106,8 @@ bool dump_intermediate = true;
|
||||
bool force_edit_distance = false;
|
||||
u32 edit_distance = 0;
|
||||
|
||||
int use_literal_api = 0;
|
||||
|
||||
} // namespace
|
||||
|
||||
// Usage statement.
|
||||
@@ -139,6 +141,7 @@ void usage(const char *name, const char *error) {
|
||||
printf(" -8 Force UTF8 mode on all patterns.\n");
|
||||
printf(" -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n");
|
||||
printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n");
|
||||
printf(" --literal-on Use Hyperscan pure literal matching API.\n");
|
||||
printf("\n");
|
||||
printf("Example:\n");
|
||||
printf("$ %s -e pattern.file -s sigfile\n", name);
|
||||
@@ -163,6 +166,7 @@ void processArgs(int argc, char *argv[], Grey &grey) {
|
||||
{"utf8", no_argument, nullptr, '8'},
|
||||
{"prefilter", no_argument, &force_prefilter, 1},
|
||||
{"som-width", required_argument, nullptr, 'd'},
|
||||
{"literal-on", no_argument, &use_literal_api, 1},
|
||||
{nullptr, 0, nullptr, 0}
|
||||
};
|
||||
|
||||
@@ -501,9 +505,23 @@ unsigned int dumpDataMulti(const vector<const char *> &patterns,
|
||||
hs_database_t *db = nullptr;
|
||||
hs_compile_error_t *compile_err;
|
||||
|
||||
hs_error_t err = hs_compile_multi_int(
|
||||
patterns.data(), flags.data(), ids.data(), ext.c_array(),
|
||||
patterns.size(), mode, plat_info.get(), &db, &compile_err, grey);
|
||||
hs_error_t err;
|
||||
const size_t count = patterns.size();
|
||||
if (use_literal_api) {
|
||||
// Compute length of each pattern.
|
||||
vector<size_t> lens(count);
|
||||
for (unsigned int i = 0; i < count; i++) {
|
||||
lens[i] = strlen(patterns[i]);
|
||||
}
|
||||
err = hs_compile_lit_multi_int(patterns.data(), flags.data(),
|
||||
ids.data(), ext.c_array(), lens.data(),
|
||||
count, mode, plat_info.get(), &db,
|
||||
&compile_err, grey);
|
||||
} else {
|
||||
err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(),
|
||||
ext.c_array(), count, mode, plat_info.get(),
|
||||
&db, &compile_err, grey);
|
||||
}
|
||||
|
||||
if (err != HS_SUCCESS) {
|
||||
if (compile_err && compile_err->message) {
|
||||
|
||||
Reference in New Issue
Block a user