diff --git a/tools/hscheck/CMakeLists.txt b/tools/hscheck/CMakeLists.txt
new file mode 100644
index 00000000..065d4c04
--- /dev/null
+++ b/tools/hscheck/CMakeLists.txt
@@ -0,0 +1,10 @@
+# only set these after all tests are done
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
+
+SET(hscheck_SOURCES
+    main.cpp
+)
+add_executable(hscheck ${hscheck_SOURCES})
+target_link_libraries(hscheck hs expressionutil pthread)
+
diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp
new file mode 100644
index 00000000..59f80244
--- /dev/null
+++ b/tools/hscheck/main.cpp
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file
+ * \brief hscheck: Tool to test regex compilation with Hyperscan.
+ *
+ * hscheck accepts a file of regular expressions in the form:
+ * "ID:/regex/flags" and tests whether they can be compiled with Hyperscan,
+ * reporting the error if compilation fails.
+ *
+ * For example, create the file "regex" containing:
+ *
+ *     1:/foo.*bar/s
+ *     2:/hatstand|teakettle|badgerbrush/
+ *
+ * This can be checked with the following hscheck invocation:
+ *
+ *     $ bin/hscheck -e regex
+ *
+ * Use "hscheck -h" for complete usage information.
+ */
+
+#include "config.h"
+
+#include "ExpressionParser.h"
+#include "expressions.h"
+#include "string_util.h"
+#include "util/expression_path.h"
+#include "util/make_unique.h"
+
+#include "grey.h"
+#include "hs_compile.h"
+#include "hs_internal.h"
+#include "ue2common.h"
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <string>
+#include <thread>
+
+#include <getopt.h>
+#include <boost/algorithm/string/trim.hpp>
+
+using namespace std;
+using namespace ue2;
+
+namespace /* anonymous */ {
+
+// are we in streaming mode? (default: yes)
+bool g_streaming = true;
+bool g_vectored = false;
+string g_exprPath("");
+string g_signatureFile("");
+bool g_allSignatures = false;
+bool g_forceEditDistance = false;
+bool build_sigs = false;
+unsigned int g_signature;
+unsigned int g_editDistance;
+unsigned int globalFlags = 0;
+unsigned int num_of_threads = 1;
+unsigned int countFailures = 0;
+
+// Global greybox structure, used in non-release builds.
+unique_ptr<Grey> g_grey;
+
+// Global expression map.
+ExpressionMap g_exprMap;
+
+// Iterator pointing to next expression to process.
+ExpressionMap::const_iterator read_it;
+
+// Iterator pointing to next expression to print results.
+ExpressionMap::const_iterator print_it;
+
+// Mutex guarding access to read iterator.
+std::mutex lk_read;
+
+// Mutex serialising access to output map and stdout.
+std::mutex lk_output;
+
+// Possible values for pattern check results.
+enum ExprStatus {NOT_PROCESSED, SUCCESS, FAILURE};
+
+// Map for storing results.
+map<unsigned int, pair<string, ExprStatus>> output;
+
+} // namespace
+
+// Hands out the next unprocessed expression to a worker thread. Sets 'it'
+// and returns true if one was available; returns false once the map is
+// exhausted. The shared read iterator is advanced under lk_read.
+static
+bool getNextExpressionId(ExpressionMap::const_iterator &it) {
+    lock_guard<mutex> lock(lk_read);
+    if (read_it != g_exprMap.end()) {
+        it = read_it;
+        ++read_it;
+        return true;
+    } else {
+        return false;
+    }
+}
+
+// Prints results in expression-map order, starting at print_it and stopping
+// at the first expression that has not been processed yet. In build_sigs mode
+// the output is a signature list, with failing IDs written as "# id # error".
+// Caller is required to hold lk_output when calling this function.
+static
+void printExpressionId(const ExpressionMap &exprMap) {
+    while (print_it != exprMap.end()) {
+        unsigned int id = print_it->first;
+        const string &regex = print_it->second;
+        const auto &result = output[id];
+        if (result.second == NOT_PROCESSED) {
+            break;
+        }
+        bool fail = result.second == FAILURE;
+        if (!build_sigs) {
+            if (fail) {
+                cout << "FAIL (compile): " << id << ":" << regex << ": "
+                     << result.first << endl;
+            } else {
+                cout << result.first << ' ' << id << ":" << regex << endl;
+            }
+        } else {
+            if (fail) {
+                cout << "# " << id << " # " << result.first << endl;
+            } else {
+                cout << id << endl;
+            }
+        }
+
+        ++print_it;
+    }
+}
+
+// Stores a failure result (with error text) for 'id', bumps the failure
+// count and prints any results that are now printable. Takes lk_output.
+static
+void recordFailure(const ExpressionMap &exprMap, unsigned int id,
+                   const string &err) {
+    lock_guard<mutex> lock(lk_output);
+    output[id].first = err;
+    output[id].second = FAILURE;
+    countFailures++;
+    printExpressionId(exprMap);
+}
+
+// Stores a success result for 'id' and prints any results that are now
+// printable. Takes lk_output.
+static
+void recordSuccess(const ExpressionMap &exprMap, unsigned int id) {
+    lock_guard<mutex> lock(lk_output);
+    output[id].first = "OK:";
+    output[id].second = SUCCESS;
+    printExpressionId(exprMap);
+}
+
+// Worker thread body: repeatedly claims an expression, parses it and tries
+// to compile it as a single-pattern database in the selected mode, recording
+// the outcome. Returns when no expressions remain.
+static
+void checkExpression(UNUSED void *threadarg) {
+    unsigned int mode = g_streaming ? HS_MODE_STREAM
+                      : g_vectored ? HS_MODE_VECTORED
+                                   : HS_MODE_BLOCK;
+    if (g_streaming) {
+        // Use SOM mode, for permissiveness' sake.
+        mode |= HS_MODE_SOM_HORIZON_LARGE;
+    }
+
+    ExpressionMap::const_iterator it;
+    while (getNextExpressionId(it)) {
+        const string &line = it->second;
+
+        // Initial slash char is required, but unused.
+        if (line.empty() || line[0] != '/') {
+            recordFailure(g_exprMap, it->first,
+                          "Format required is \"ID:/REGEX/FLAGS\".");
+            continue;
+        }
+
+        // Make a mutable copy and trim surrounding whitespace.
+        string expr = line;
+        boost::trim(expr);
+
+        size_t flagsStart = expr.find_last_of('/');
+        if (flagsStart == string::npos || flagsStart == 0) {
+            recordFailure(g_exprMap, it->first, "No trailing '/' char.");
+            continue;
+        }
+
+        string regex;
+        unsigned int flags = 0;
+        hs_expr_ext ext;
+        if (!readExpression(expr, regex, &flags, &ext)) {
+            recordFailure(g_exprMap, it->first, "Unsupported flag used.");
+            continue;
+        }
+
+        flags |= globalFlags;
+        if (g_forceEditDistance) {
+            ext.edit_distance = g_editDistance;
+            ext.flags |= HS_EXT_FLAG_EDIT_DISTANCE;
+        }
+
+        // Try and compile a database.
+        const char *regexp = regex.c_str();
+        const hs_expr_ext *extp = &ext;
+
+        hs_error_t err;
+        hs_compile_error_t *compile_err = nullptr;
+        hs_database_t *db = nullptr;
+
+#if !defined(RELEASE_BUILD)
+        // This variant is available in non-release builds and allows us to
+        // modify greybox settings.
+        err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, mode,
+                                   nullptr, &db, &compile_err, *g_grey);
+#else
+        err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, mode,
+                                   nullptr, &db, &compile_err);
+#endif
+
+        if (err == HS_SUCCESS) {
+            assert(db);
+            recordSuccess(g_exprMap, it->first);
+            hs_free_database(db);
+        } else {
+            assert(!db);
+            assert(compile_err);
+            recordFailure(g_exprMap, it->first, compile_err->message);
+            hs_free_compile_error(compile_err);
+        }
+    }
+}
+
+// Prints command-line help to stdout.
+static
+void usage() {
+    cout << "Usage: hscheck [OPTIONS...]" << endl << endl
+         << " -e PATH Path to expression directory." << endl
+         << " -s FILE Signature file to use." << endl
+         << " -z NUM Signature ID to use." << endl
+         << " -E DISTANCE Force edit distance to DISTANCE for all patterns." << endl
+#ifndef RELEASE_BUILD
+         << " -G OVERRIDES Overrides for the grey." << endl
+#endif
+         << " -V Operate in vectored mode." << endl
+         << " -N Operate in block mode (default: streaming)." << endl
+         << " -L Pass HS_FLAG_SOM_LEFTMOST for all expressions (default: off)." << endl
+         << " -8 Force UTF8 mode on all patterns." << endl
+         << " -T NUM Run with NUM threads." << endl
+         << " -h Display this help." << endl
+         << " -B Build signature set." << endl
+         << endl;
+}
+
+// Parses command-line options into the globals above. Prints usage and exits
+// if a numeric option cannot be parsed or if no expression path / signature
+// selection can be determined.
+static
+void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) {
+    const char options[] = "e:E:s:z:hLNV8G:T:B";
+    bool signatureSet = false;
+
+    for (;;) {
+        int c = getopt_long(argc, argv, options, nullptr, nullptr);
+        if (c < 0) {
+            break;
+        }
+        switch (c) {
+        case 'e':
+            g_exprPath.assign(optarg);
+            break;
+        case 'h':
+            usage();
+            exit(0);
+            break;
+        case 's':
+            g_signatureFile.assign(optarg);
+            break;
+        case 'E':
+            if (!fromString(optarg, g_editDistance)) {
+                usage();
+                exit(1);
+            }
+            g_forceEditDistance = true;
+            break;
+        case 'z':
+            if (!fromString(optarg, g_signature)) {
+                usage();
+                exit(1);
+            }
+            signatureSet = true;
+            break;
+        case '8':
+            globalFlags |= HS_FLAG_UTF8;
+            break;
+
+#ifndef RELEASE_BUILD
+        case 'G':
+            applyGreyOverrides(grey.get(), string(optarg));
+            break;
+#endif
+        case 'L':
+            globalFlags |= HS_FLAG_SOM_LEFTMOST;
+            break;
+        case 'N':
+            g_streaming = false;
+            break;
+        case 'V':
+            g_streaming = false;
+            g_vectored = true;
+            break;
+        case 'T':
+            if (!fromString(optarg, num_of_threads)) {
+                usage();
+                exit(1);
+            }
+            break;
+        case 'B':
+            build_sigs = true;
+            break;
+        default:
+            usage();
+            exit(1);
+        }
+    }
+
+    if (g_exprPath.empty() && !g_signatureFile.empty()) {
+        /* attempt to infer an expression directory */
+        g_exprPath = inferExpressionPath(g_signatureFile);
+    }
+
+    if (g_exprPath.empty()) {
+        usage();
+        exit(1);
+    }
+
+    if (!isDir(g_exprPath) && isFile(g_exprPath)
+        && g_signatureFile.empty() && !signatureSet) {
+        g_allSignatures = true;
+    }
+
+    if (g_signatureFile.empty() && !signatureSet && !g_allSignatures) {
+        usage();
+        exit(1);
+    }
+}
+
+// Reports a parse error for the given file and line on stderr and exits.
+static
+void failLine(unsigned lineNum, const string &file,
+              const string &line, const string &error) {
+    cerr << "Parse error in file " << file
+         << " on line " << lineNum << ": " << error
+         << endl << "Line is: '" << line << "'" << endl;
+    exit(1);
+}
+
+// Loads a list of signature IDs when build_sigs is enabled.
+// If a line is commented out, this function still loads the corresponding ID;
+// such lines are expected to have the format "#id#" (optionally followed by
+// more text). Comment lines that do not yield an ID are echoed to stdout.
+// Any other non-empty line that is not a valid ID is a fatal parse error.
+static
+void loadSignatureBuildSigs(const string &inFile,
+                            SignatureSet &signatures) {
+    ifstream f(inFile.c_str());
+    if (!f.good()) {
+        cerr << "Can't open file: '" << inFile << "'" << endl;
+        exit(1);
+    }
+
+    unsigned lineNum = 0;
+    string line;
+    while (getline(f, line)) {
+        lineNum++;
+        unsigned id;
+        // if line is empty, we can skip it
+        if (line.empty()) {
+            continue;
+        }
+        // if line is commented out, try to locate the ID
+        // Line is usually in the form #id#
+        if (line[0] == '#') {
+            string temp;
+            // skip the opening hash and see if there is a second
+            size_t comment = line.find_first_of('#', 1);
+            if (comment != string::npos) {
+                temp = line.substr(1, comment - 1);
+            } else {
+                temp = line.substr(1, line.size());
+            }
+            // cull any whitespace
+            boost::trim(temp);
+
+            if (fromString(temp, id)) {
+                signatures.push_back(id);
+            } else {
+                // couldn't be turned into an ID, dump to stdout
+                cout << line << endl;
+            }
+        } else { // lines that don't begin with #
+            if (fromString(line, id)) {
+                signatures.push_back(id);
+            } else {
+                // Parse error occurred
+                failLine(lineNum, inFile, line, "Unable to parse ID.");
+            }
+        }
+    }
+}
+
+// Entry point: loads the expressions, restricts them to the requested
+// signatures and checks them on num_of_threads worker threads.
+int main(int argc, char **argv) {
+    num_of_threads = max(1u, std::thread::hardware_concurrency());
+
+#if !defined(RELEASE_BUILD)
+    g_grey = make_unique<Grey>();
+#endif
+    processArgs(argc, argv, g_grey);
+
+    if (num_of_threads == 0) {
+        cout << "Error: Must have at least one thread." << endl;
+        exit(1);
+    }
+
+    loadExpressions(g_exprPath, g_exprMap);
+
+    if (!g_allSignatures) {
+        SignatureSet signatures;
+        if (!g_signatureFile.empty()) {
+            if (!build_sigs) {
+                loadSignatureList(g_signatureFile, signatures);
+            } else {
+                loadSignatureBuildSigs(g_signatureFile, signatures);
+            }
+        } else {
+            signatures.push_back(g_signature);
+        }
+
+        g_exprMap = limitToSignatures(g_exprMap, signatures);
+    }
+
+    if (g_exprMap.empty()) {
+        cout << "Warning: no signatures to scan. Exiting." << endl;
+        exit(0);
+    }
+
+    read_it = g_exprMap.begin();
+    print_it = g_exprMap.begin();
+    vector<thread> threads(num_of_threads);
+
+    for (unsigned int i = 0; i < num_of_threads; i++) {
+        threads[i] = thread(checkExpression, nullptr);
+    }
+
+    for (unsigned int i = 0; i < num_of_threads; i++) {
+        threads[i].join();
+    }
+
+    if (!g_exprMap.empty() && !build_sigs) {
+        cout << "SUMMARY: " << countFailures << " of "
+             << g_exprMap.size() << " failed." << endl;
+    }
+    return 0;
+}