/* * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** * \file * \brief hscheck: Tool to test regex compilation with Hyperscan. * * hscheck accepts a file of regular expressions in the form: * "ID:/regex/flags" and tests whether they can be compiled with Hyperscan, * reporting the error if compilation fails. * * For example, create the file "regex" containing: * * 1:/foo.*bar/s * 2:/hatstand|teakettle|badgerbrush/ * * This can be checked with the following hscheck invocation: * * $ bin/hscheck -e regex * * Use "hscheck -h" for complete usage information. */ #include "config.h" #include "ExpressionParser.h" #include "expressions.h" #include "string_util.h" #include "util/expression_path.h" #include "util/make_unique.h" #include "grey.h" #include "hs_compile.h" #include "hs_internal.h" #include "ue2common.h" #include #include #include #include #include #include #include #include using namespace std; using namespace ue2; namespace /* anonymous */ { // are we in streaming mode? (default: yes) bool g_streaming = true; bool g_vectored = false; string g_exprPath(""); string g_signatureFile(""); bool g_allSignatures = false; bool g_forceEditDistance = false; bool build_sigs = false; unsigned int g_signature; unsigned int g_editDistance; unsigned int globalFlags = 0; unsigned int num_of_threads = 1; unsigned int countFailures = 0; // Global greybox structure, used in non-release builds. unique_ptr g_grey; // Global expression map. ExpressionMap g_exprMap; // Iterator pointing to next expression to process. ExpressionMap::const_iterator read_it; // Iterator pointing to next expression to print results. ExpressionMap::const_iterator print_it; // Mutex guarding access to read iterator. std::mutex lk_read; // Mutex serialising access to output map and stdout. std::mutex lk_output; // Possible values for pattern check results. enum ExprStatus {NOT_PROCESSED, SUCCESS, FAILURE}; // Map for storing results. map> output; } // namespace static bool getNextExpressionId(ExpressionMap::const_iterator &it) { lock_guard lock(lk_read); if (read_it != g_exprMap.end()) { it = read_it; ++read_it; return true; } else { return false; } } // This function prints the Pattern IDs order // It creates the output for build sigs // Caller is required to hold lk_output when calling this function static void printExpressionId(const ExpressionMap &exprMap) { while (print_it != exprMap.end()) { unsigned int id = print_it->first; const string ®ex = print_it->second; const auto &result = output[id]; if (result.second == NOT_PROCESSED) { break; } bool fail = result.second == FAILURE; if (!build_sigs) { if (fail) { cout << "FAIL (compile): " << id << ":" << regex << ": " << result.first << endl; } else { cout << result.first << ' ' << id << ":" << regex << endl; } } else { if (fail) { cout << "# " << id << " # " << result.first << endl; } else { cout << id << endl; } } ++print_it; } } static void recordFailure(const ExpressionMap &exprMap, unsigned int id, const string &err) { lock_guard lock(lk_output); output[id].first = err; output[id].second = FAILURE; countFailures++; printExpressionId(exprMap); } static void recordSuccess(const ExpressionMap &exprMap, unsigned int id) { lock_guard lock(lk_output); output[id].first = "OK:"; output[id].second = SUCCESS; printExpressionId(exprMap); } static void checkExpression(UNUSED void *threadarg) { unsigned int mode = g_streaming ? HS_MODE_STREAM : g_vectored ? HS_MODE_VECTORED : HS_MODE_BLOCK; if (g_streaming) { // Use SOM mode, for permissiveness' sake. mode |= HS_MODE_SOM_HORIZON_LARGE; } ExpressionMap::const_iterator it; while (getNextExpressionId(it)) { const string &line = it->second; // Initial slash char is required, but unused. if (line.empty() || line[0] != '/') { recordFailure(g_exprMap, it->first, "Format required is \"ID:/REGEX/FLAGS\"."); continue; } // Make a mutable copy and trim any whitespace on the right. string expr = line; boost::trim(expr); size_t flagsStart = expr.find_last_of('/'); if (flagsStart == string::npos || flagsStart == 0) { recordFailure(g_exprMap, it->first, "No trailing '/' char."); continue; } string regex; unsigned int flags = 0; hs_expr_ext ext; if (!readExpression(expr, regex, &flags, &ext)) { recordFailure(g_exprMap, it->first, "Unsupported flag used."); continue; } flags |= globalFlags; if (g_forceEditDistance) { ext.edit_distance = g_editDistance; ext.flags |= HS_EXT_FLAG_EDIT_DISTANCE; } // Try and compile a database. const char *regexp = regex.c_str(); const hs_expr_ext *extp = &ext; hs_error_t err; hs_compile_error_t *compile_err; hs_database_t *db = nullptr; #if !defined(RELEASE_BUILD) // This variant is available in non-release builds and allows us to // modify greybox settings. err = hs_compile_multi_int(®exp, &flags, nullptr, &extp, 1, mode, nullptr, &db, &compile_err, *g_grey); #else err = hs_compile_ext_multi(®exp, &flags, nullptr, &extp, 1, mode, nullptr, &db, &compile_err); #endif if (err == HS_SUCCESS) { assert(db); recordSuccess(g_exprMap, it->first); hs_free_database(db); } else { assert(!db); assert(compile_err); recordFailure(g_exprMap, it->first, compile_err->message); hs_free_compile_error(compile_err); } } } static void usage() { cout << "Usage: hscheck [OPTIONS...]" << endl << endl << " -e PATH Path to expression directory." << endl << " -s FILE Signature file to use." << endl << " -z NUM Signature ID to use." << endl << " -E DISTANCE Force edit distance to DISTANCE for all patterns." << endl #ifndef RELEASE_BUILD << " -G OVERRIDES Overrides for the grey." << endl #endif << " -V Operate in vectored mode." << endl << " -N Operate in block mode (default: streaming)." << endl << " -L Pass HS_FLAG_SOM_LEFTMOST for all expressions (default: off)." << endl << " -8 Force UTF8 mode on all patterns." << endl << " -T NUM Run with NUM threads." << endl << " -h Display this help." << endl << " -B Build signature set." << endl << endl; } static void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { const char options[] = "e:E:s:z:hLNV8G:T:B"; bool signatureSet = false; for (;;) { int c = getopt_long(argc, argv, options, nullptr, nullptr); if (c < 0) { break; } switch (c) { case 'e': g_exprPath.assign(optarg); break; case 'h': usage(); exit(0); break; case 's': g_signatureFile.assign(optarg); break; case 'E': if (!fromString(optarg, g_editDistance)) { usage(); exit(1); } g_forceEditDistance = true; break; case 'z': if (!fromString(optarg, g_signature)) { usage(); exit(1); } signatureSet = true; break; case '8': globalFlags |= HS_FLAG_UTF8; break; #ifndef RELEASE_BUILD case 'G': applyGreyOverrides(grey.get(), string(optarg)); break; #endif case 'L': globalFlags |= HS_FLAG_SOM_LEFTMOST; break; case 'N': g_streaming = false; break; case 'V': g_streaming = false; g_vectored = true; break; case 'T': num_of_threads = atoi(optarg); break; case 'B': build_sigs = true; break; default: usage(); exit(1); } } if (g_exprPath.empty() && !g_signatureFile.empty()) { /* attempt to infer an expression directory */ g_exprPath = inferExpressionPath(g_signatureFile); } if (g_exprPath.empty()) { usage(); exit(1); } if (!isDir(g_exprPath) && isFile(g_exprPath) && g_signatureFile.empty() && !signatureSet) { g_allSignatures = true; } if (g_signatureFile.empty() && !signatureSet && !g_allSignatures) { usage(); exit(1); } } static void failLine(unsigned lineNum, const string &file, const string &line, const string &error) { cerr << "Parse error in file " << file << " on line " << lineNum << ": " << error << endl << "Line is: '" << line << "'" << endl; exit(1); } // load a list of signature IDs if Build_sigs is enabled // If a line is commented out, this function still loads the corresponding ID. // The commented out line should have the format #id# // It then prints out a signature file with the IDs that compile successfully. static void loadSignatureBuildSigs(const string &inFile, SignatureSet &signatures) { ifstream f(inFile.c_str()); if (!f.good()) { cerr << "Can't open file: '" << inFile << "'" << endl; exit(1); } unsigned lineNum = 0; string line; while (getline(f, line)) { lineNum++; unsigned id; // if line is empty, we can skip it if (line.empty()) { continue; } // if line is commented out, try to locate the ID // Line is usually in the form #id# if (line[0] == '#') { string temp; // skip the opening hash and see if there is a second size_t comment = line.find_first_of('#', 1); if (comment) { temp = line.substr(1, comment - 1); } else { temp = line.substr(1, line.size()); } // cull any whitespace boost::trim(temp); if (fromString(temp, id)) { signatures.push_back(id); } else { // couldn't be turned into an ID, dump to stdout cout << line << endl; } } else { // lines that don't begin with # if (fromString(line, id)) { signatures.push_back(id); } else { // Parse error occurred failLine(lineNum, inFile, line, "Unable to parse ID."); } } } } int main(int argc, char **argv) { num_of_threads = max(1u, std::thread::hardware_concurrency()); #if !defined(RELEASE_BUILD) g_grey = make_unique(); #endif processArgs(argc, argv, g_grey); if (num_of_threads == 0) { cout << "Error: Must have at least one thread." << endl; exit(1); } loadExpressions(g_exprPath, g_exprMap); if (!g_allSignatures) { SignatureSet signatures; if (!g_signatureFile.empty()) { if (!build_sigs) { loadSignatureList(g_signatureFile, signatures); } else { loadSignatureBuildSigs(g_signatureFile, signatures); } } else { signatures.push_back(g_signature); } g_exprMap = limitToSignatures(g_exprMap, signatures); } if (g_exprMap.empty()) { cout << "Warning: no signatures to scan. Exiting." << endl; exit(0); } read_it = g_exprMap.begin(); print_it = g_exprMap.begin(); vector threads(num_of_threads); for (unsigned int i = 0; i < num_of_threads; i++) { threads[i] = thread(checkExpression, nullptr); } for (unsigned int i = 0; i < num_of_threads; i++) { threads[i].join(); } if (!g_exprMap.empty() && !build_sigs) { cout << "SUMMARY: " << countFailures << " of " << g_exprMap.size() << " failed." << endl; } return 0; }