hscheck: tool to test regex compilation

hscheck accepts a file of regular expressions in the form:

    ID:/regex/flags

and tests whether they can be compiled with Hyperscan, reporting the
error if compilation fails.
This commit is contained in:
Justin Viiret 2017-12-07 14:07:48 +11:00 committed by Xiang Wang
parent 1330265ced
commit 720ec745c1
2 changed files with 486 additions and 0 deletions

View File

@ -0,0 +1,10 @@
# only set these after all tests are done
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
SET(hscheck_SOURCES
main.cpp
)
add_executable(hscheck ${hscheck_SOURCES})
target_link_libraries(hscheck hs expressionutil pthread)

476
tools/hscheck/main.cpp Normal file
View File

@ -0,0 +1,476 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* \file
* \brief hscheck: Tool to test regex compilation with Hyperscan.
*
* hscheck accepts a file of regular expressions in the form:
* "ID:/regex/flags" and tests whether they can be compiled with Hyperscan,
* reporting the error if compilation fails.
*
* For example, create the file "regex" containing:
*
* 1:/foo.*bar/s
* 2:/hatstand|teakettle|badgerbrush/
*
* This can be checked with the following hscheck invocation:
*
* $ bin/hscheck -e regex
*
* Use "hscheck -h" for complete usage information.
*/
#include "config.h"
#include "ExpressionParser.h"
#include "expressions.h"
#include "string_util.h"
#include "util/expression_path.h"
#include "util/make_unique.h"
#include "grey.h"
#include "hs_compile.h"
#include "hs_internal.h"
#include "ue2common.h"
#include <cassert>
#include <fstream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>
#include <getopt.h>
#include <boost/algorithm/string/trim.hpp>
using namespace std;
using namespace ue2;
namespace /* anonymous */ {
// are we in streaming mode? (default: yes)
bool g_streaming = true;
bool g_vectored = false;
string g_exprPath("");
string g_signatureFile("");
bool g_allSignatures = false;
bool g_forceEditDistance = false;
bool build_sigs = false;
unsigned int g_signature;
unsigned int g_editDistance;
unsigned int globalFlags = 0;
unsigned int num_of_threads = 1;
unsigned int countFailures = 0;
// Global greybox structure, used in non-release builds.
unique_ptr<Grey> g_grey;
// Global expression map.
ExpressionMap g_exprMap;
// Iterator pointing to next expression to process.
ExpressionMap::const_iterator read_it;
// Iterator pointing to next expression to print results.
ExpressionMap::const_iterator print_it;
// Mutex guarding access to read iterator.
std::mutex lk_read;
// Mutex serialising access to output map and stdout.
std::mutex lk_output;
// Possible values for pattern check results.
enum ExprStatus {NOT_PROCESSED, SUCCESS, FAILURE};
// Map for storing results.
map<unsigned int, pair<string, ExprStatus>> output;
} // namespace
static
bool getNextExpressionId(ExpressionMap::const_iterator &it) {
lock_guard<mutex> lock(lk_read);
if (read_it != g_exprMap.end()) {
it = read_it;
++read_it;
return true;
} else {
return false;
}
}
// This function prints the Pattern IDs order
// It creates the output for build sigs
// Caller is required to hold lk_output when calling this function
static
void printExpressionId(const ExpressionMap &exprMap) {
while (print_it != exprMap.end()) {
unsigned int id = print_it->first;
const string &regex = print_it->second;
const auto &result = output[id];
if (result.second == NOT_PROCESSED) {
break;
}
bool fail = result.second == FAILURE;
if (!build_sigs) {
if (fail) {
cout << "FAIL (compile): " << id << ":" << regex << ": "
<< result.first << endl;
} else {
cout << result.first << ' ' << id << ":" << regex << endl;
}
} else {
if (fail) {
cout << "# " << id << " # " << result.first << endl;
} else {
cout << id << endl;
}
}
++print_it;
}
}
static
void recordFailure(const ExpressionMap &exprMap, unsigned int id,
const string &err) {
lock_guard<mutex> lock(lk_output);
output[id].first = err;
output[id].second = FAILURE;
countFailures++;
printExpressionId(exprMap);
}
static
void recordSuccess(const ExpressionMap &exprMap, unsigned int id) {
lock_guard<mutex> lock(lk_output);
output[id].first = "OK:";
output[id].second = SUCCESS;
printExpressionId(exprMap);
}
static
void checkExpression(UNUSED void *threadarg) {
unsigned int mode = g_streaming ? HS_MODE_STREAM
: g_vectored ? HS_MODE_VECTORED
: HS_MODE_BLOCK;
if (g_streaming) {
// Use SOM mode, for permissiveness' sake.
mode |= HS_MODE_SOM_HORIZON_LARGE;
}
ExpressionMap::const_iterator it;
while (getNextExpressionId(it)) {
const string &line = it->second;
// Initial slash char is required, but unused.
if (line.empty() || line[0] != '/') {
recordFailure(g_exprMap, it->first,
"Format required is \"ID:/REGEX/FLAGS\".");
continue;
}
// Make a mutable copy and trim any whitespace on the right.
string expr = line;
boost::trim(expr);
size_t flagsStart = expr.find_last_of('/');
if (flagsStart == string::npos || flagsStart == 0) {
recordFailure(g_exprMap, it->first, "No trailing '/' char.");
continue;
}
string regex;
unsigned int flags = 0;
hs_expr_ext ext;
if (!readExpression(expr, regex, &flags, &ext)) {
recordFailure(g_exprMap, it->first, "Unsupported flag used.");
continue;
}
flags |= globalFlags;
if (g_forceEditDistance) {
ext.edit_distance = g_editDistance;
ext.flags |= HS_EXT_FLAG_EDIT_DISTANCE;
}
// Try and compile a database.
const char *regexp = regex.c_str();
const hs_expr_ext *extp = &ext;
hs_error_t err;
hs_compile_error_t *compile_err;
hs_database_t *db = nullptr;
#if !defined(RELEASE_BUILD)
// This variant is available in non-release builds and allows us to
// modify greybox settings.
err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, mode,
nullptr, &db, &compile_err, *g_grey);
#else
err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, mode,
nullptr, &db, &compile_err);
#endif
if (err == HS_SUCCESS) {
assert(db);
recordSuccess(g_exprMap, it->first);
hs_free_database(db);
} else {
assert(!db);
assert(compile_err);
recordFailure(g_exprMap, it->first, compile_err->message);
hs_free_compile_error(compile_err);
}
}
}
static
void usage() {
cout << "Usage: hscheck [OPTIONS...]" << endl << endl
<< " -e PATH Path to expression directory." << endl
<< " -s FILE Signature file to use." << endl
<< " -z NUM Signature ID to use." << endl
<< " -E DISTANCE Force edit distance to DISTANCE for all patterns." << endl
#ifndef RELEASE_BUILD
<< " -G OVERRIDES Overrides for the grey." << endl
#endif
<< " -V Operate in vectored mode." << endl
<< " -N Operate in block mode (default: streaming)." << endl
<< " -L Pass HS_FLAG_SOM_LEFTMOST for all expressions (default: off)." << endl
<< " -8 Force UTF8 mode on all patterns." << endl
<< " -T NUM Run with NUM threads." << endl
<< " -h Display this help." << endl
<< " -B Build signature set." << endl
<< endl;
}
static
void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) {
const char options[] = "e:E:s:z:hLNV8G:T:B";
bool signatureSet = false;
for (;;) {
int c = getopt_long(argc, argv, options, nullptr, nullptr);
if (c < 0) {
break;
}
switch (c) {
case 'e':
g_exprPath.assign(optarg);
break;
case 'h':
usage();
exit(0);
break;
case 's':
g_signatureFile.assign(optarg);
break;
case 'E':
if (!fromString(optarg, g_editDistance)) {
usage();
exit(1);
}
g_forceEditDistance = true;
break;
case 'z':
if (!fromString(optarg, g_signature)) {
usage();
exit(1);
}
signatureSet = true;
break;
case '8':
globalFlags |= HS_FLAG_UTF8;
break;
#ifndef RELEASE_BUILD
case 'G':
applyGreyOverrides(grey.get(), string(optarg));
break;
#endif
case 'L':
globalFlags |= HS_FLAG_SOM_LEFTMOST;
break;
case 'N':
g_streaming = false;
break;
case 'V':
g_streaming = false;
g_vectored = true;
break;
case 'T':
num_of_threads = atoi(optarg);
break;
case 'B':
build_sigs = true;
break;
default:
usage();
exit(1);
}
}
if (g_exprPath.empty() && !g_signatureFile.empty()) {
/* attempt to infer an expression directory */
g_exprPath = inferExpressionPath(g_signatureFile);
}
if (g_exprPath.empty()) {
usage();
exit(1);
}
if (!isDir(g_exprPath) && isFile(g_exprPath)
&& g_signatureFile.empty() && !signatureSet) {
g_allSignatures = true;
}
if (g_signatureFile.empty() && !signatureSet && !g_allSignatures) {
usage();
exit(1);
}
}
static
void failLine(unsigned lineNum, const string &file,
const string &line, const string &error) {
cerr << "Parse error in file " << file
<< " on line " << lineNum << ": " << error
<< endl << "Line is: '" << line << "'" << endl;
exit(1);
}
// load a list of signature IDs if Build_sigs is enabled
// If a line is commented out, this function still loads the corresponding ID.
// The commented out line should have the format #<space>id<space>#<comment>
// It then prints out a signature file with the IDs that compile successfully.
static
void loadSignatureBuildSigs(const string &inFile,
SignatureSet &signatures) {
ifstream f(inFile.c_str());
if (!f.good()) {
cerr << "Can't open file: '" << inFile << "'" << endl;
exit(1);
}
unsigned lineNum = 0;
string line;
while (getline(f, line)) {
lineNum++;
unsigned id;
// if line is empty, we can skip it
if (line.empty()) {
continue;
}
// if line is commented out, try to locate the ID
// Line is usually in the form #<space>id<space>#<comment>
if (line[0] == '#') {
string temp;
// skip the opening hash and see if there is a second
size_t comment = line.find_first_of('#', 1);
if (comment) {
temp = line.substr(1, comment - 1);
} else {
temp = line.substr(1, line.size());
}
// cull any whitespace
boost::trim(temp);
if (fromString(temp, id)) {
signatures.push_back(id);
} else {
// couldn't be turned into an ID, dump to stdout
cout << line << endl;
}
} else { // lines that don't begin with #
if (fromString(line, id)) {
signatures.push_back(id);
} else {
// Parse error occurred
failLine(lineNum, inFile, line, "Unable to parse ID.");
}
}
}
}
int main(int argc, char **argv) {
num_of_threads = max(1u, std::thread::hardware_concurrency());
#if !defined(RELEASE_BUILD)
g_grey = make_unique<Grey>();
#endif
processArgs(argc, argv, g_grey);
if (num_of_threads == 0) {
cout << "Error: Must have at least one thread." << endl;
exit(1);
}
loadExpressions(g_exprPath, g_exprMap);
if (!g_allSignatures) {
SignatureSet signatures;
if (!g_signatureFile.empty()) {
if (!build_sigs) {
loadSignatureList(g_signatureFile, signatures);
} else {
loadSignatureBuildSigs(g_signatureFile, signatures);
}
} else {
signatures.push_back(g_signature);
}
g_exprMap = limitToSignatures(g_exprMap, signatures);
}
if (g_exprMap.empty()) {
cout << "Warning: no signatures to scan. Exiting." << endl;
exit(0);
}
read_it = g_exprMap.begin();
print_it = g_exprMap.begin();
vector<thread> threads(num_of_threads);
for (unsigned int i = 0; i < num_of_threads; i++) {
threads[i] = thread(checkExpression, nullptr);
}
for (unsigned int i = 0; i < num_of_threads; i++) {
threads[i].join();
}
if (!g_exprMap.empty() && !build_sigs) {
cout << "SUMMARY: " << countFailures << " of "
<< g_exprMap.size() << " failed." << endl;
}
return 0;
}