diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp
index a34eadd0..3382ff42 100644
--- a/src/compiler/compiler.cpp
+++ b/src/compiler/compiler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2018, Intel Corporation
+ * Copyright (c) 2015-2019, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -56,11 +56,13 @@
 #include "parser/unsupported.h"
 #include "parser/utf8_validate.h"
 #include "rose/rose_build.h"
+#include "rose/rose_internal.h"
 #include "som/slot_manager_dump.h"
 #include "util/bytecode_ptr.h"
 #include "util/compile_error.h"
 #include "util/target_info.h"
 #include "util/verify_types.h"
+#include "util/ue2string.h"
 
 #include
 #include
@@ -107,6 +109,56 @@ void validateExt(const hs_expr_ext &ext) {
 
 }
 
+/**
+ * \brief Append \p len raw bytes of \p expression to the literal.
+ *
+ * The length is explicit so that embedded '\0' bytes are treated as ordinary
+ * literal characters rather than as a terminator.
+ */
+void ParsedLitExpression::parseLiteral(const char *expression, size_t len,
+                                       bool nocase) {
+    for (size_t i = 0; i < len; i++) {
+        lit.push_back(expression[i], nocase);
+    }
+}
+
+/**
+ * \brief Validate \p flags and build the literal for one pure literal
+ * expression.
+ *
+ * Throws CompileError for unrecognised flags and for the unsupported
+ * HS_FLAG_SINGLEMATCH + HS_FLAG_SOM_LEFTMOST combination.
+ */
+ParsedLitExpression::ParsedLitExpression(unsigned index_in,
+                                         const char *expression,
+                                         size_t expLength, unsigned flags,
+                                         ReportID report)
+    : expr(index_in, false, flags & HS_FLAG_SINGLEMATCH, false, false,
+           SOM_NONE, report, 0, MAX_OFFSET, 0, 0, 0, false) {
+    // For a pure literal expression the following HS_FLAG_ values are unused:
+    // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET
+
+    if (flags & ~HS_FLAG_ALL) {
+        DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags);
+        throw CompileError("Unrecognised flag.");
+    }
+
+    // FIXME: we disallow highlander + SOM, see UE-1850.
+    if ((flags & HS_FLAG_SINGLEMATCH) && (flags & HS_FLAG_SOM_LEFTMOST)) {
+        throw CompileError("HS_FLAG_SINGLEMATCH is not supported in "
+                           "combination with HS_FLAG_SOM_LEFTMOST.");
+    }
+
+    // Set SOM type.
+    if (flags & HS_FLAG_SOM_LEFTMOST) {
+        expr.som = SOM_LEFT;
+    }
+
+    // Transfer expression text into ue2_literal.
+    const bool nocase = (flags & HS_FLAG_CASELESS) != 0;
+    parseLiteral(expression, expLength, nocase);
+}
+
 ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
                                    unsigned flags, ReportID report,
                                    const hs_expr_ext *ext)
@@ -345,6 +397,62 @@ void addExpression(NG &ng, unsigned index, const char *expression,
     }
 }
 
+/**
+ * \brief Add one pure literal expression to the compiler.
+ *
+ * The text is taken verbatim: \p expLength bytes starting at \p expression,
+ * with no regex parsing. Throws CompileError if extended parameters or
+ * unsupported flags are supplied, or if the literal is empty or too long.
+ */
+void addLitExpression(NG &ng, unsigned index, const char *expression,
+                      unsigned flags, const hs_expr_ext *ext, ReportID id,
+                      size_t expLength) {
+    assert(expression);
+    const CompileContext &cc = ng.cc;
+    DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%.*s', len='%zu'\n",
+                 index, id, flags, (int)expLength, expression, expLength);
+
+    // Extended parameters are not supported for pure literal patterns.
+    if (ext && ext->flags != 0LLU) {
+        throw CompileError("Extended parameters are not supported for pure "
+                           "literal matching API.");
+    }
+
+    // Ensure that our pattern isn't too long (in characters). The explicit
+    // length is authoritative: the literal may contain embedded '\0' bytes.
+    if (expLength > cc.grey.limitPatternLength) {
+        throw CompileError("Pattern length exceeds limit.");
+    }
+
+    // An empty literal cannot be matched meaningfully; reject it up front.
+    if (expLength == 0) {
+        throw CompileError("Pure literal API doesn't support empty string.");
+    }
+
+    // filter out flags not supported by pure literal API.
+    u64a not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 |
+                         HS_FLAG_UCP | HS_FLAG_PREFILTER | HS_FLAG_COMBINATION |
+                         HS_FLAG_QUIET;
+
+    if (flags & not_supported) {
+        throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_MULTILINE, "
+                           "HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are "
+                           "supported in literal API.");
+    }
+
+    // This expression must be a pure literal, we can build ue2_literal
+    // directly based on expression text.
+    ParsedLitExpression ple(index, expression, expLength, flags, id);
+
+    // Feed the ue2_literal into Rose.
+    const auto &expr = ple.expr;
+    if (ng.addLiteral(ple.lit, expr.index, expr.report, expr.highlander,
+                      expr.som, expr.quiet)) {
+        DEBUG_PRINTF("took pure literal\n");
+        return;
+    }
+}
+
 static
 bytecode_ptr generateRoseEngine(NG &ng) {
     const u32 minWidth =
@@ -416,10 +524,15 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
 }
 
 
-struct hs_database *build(NG &ng, unsigned int *length) {
+struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) {
     assert(length);
 
     auto rose = generateRoseEngine(ng);
     if (!rose) {
         throw CompileError("Unable to generate bytecode.");
     }
+
+    // Record whether this database came from the literal API. This must come
+    // after the check above: rose.get() is only dereferenceable on success.
+    struct RoseEngine *roseHead = rose.get();
+    roseHead->pureLiteral = pureFlag;
diff --git a/src/compiler/compiler.h b/src/compiler/compiler.h
index 60d7ca33..b42cb142 100644
--- a/src/compiler/compiler.h
+++ b/src/compiler/compiler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2019, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -38,6 +38,7 @@
 #include "compiler/expression_info.h"
 #include "parser/Component.h"
 #include "util/noncopyable.h"
+#include "util/ue2string.h"
 
 #include
 
@@ -66,6 +67,22 @@ public:
     std::unique_ptr component;
 };
 
+
+/** \brief Class gathering together the pieces of a parsed lit-expression. */
+class ParsedLitExpression : noncopyable {
+public:
+    ParsedLitExpression(unsigned index, const char *expression,
+                        size_t expLength, unsigned flags, ReportID report);
+
+    void parseLiteral(const char *expression, size_t len, bool nocase);
+
+    /** \brief Expression information (from flags, extparam etc) */
+    ExpressionInfo expr;
+
+    /** \brief Format the lit-expression text into Hyperscan literal type. */
+    ue2_literal lit;
+};
+
 /**
  * \brief Class gathering together the pieces of an expression that has been
  * built into an NFA graph.
@@ -99,6 +116,10 @@ struct BuiltExpression { void addExpression(NG &ng, unsigned index, const char *expression, unsigned flags, const hs_expr_ext *ext, ReportID report); +void addLitExpression(NG &ng, unsigned index, const char *expression, + unsigned flags, const hs_expr_ext *ext, ReportID id, + size_t expLength); + /** * Build a Hyperscan database out of the expressions we've been given. A * fatal error will result in an exception being thrown. @@ -107,11 +128,13 @@ void addExpression(NG &ng, unsigned index, const char *expression, * The global NG object. * @param[out] length * The number of bytes occupied by the compiled structure. + * @param pureFlag + * The flag indicating invocation from literal API or not. * @return * The compiled structure. Should be deallocated with the * hs_database_free() function. */ -struct hs_database *build(NG &ng, unsigned int *length); +struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag); /** * Constructs an NFA graph from the given expression tree. diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 39cbc335..fcfc0863 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -807,9 +807,6 @@ void findIncludedLits(vector &lits, for (size_t i = 0; i < cnt; i++) { u32 bucket1 = group[i].first; u32 id1 = group[i].second; - if (lits[id1].pure) { - continue; - } buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map, exception_map); } diff --git a/src/fdr/fdr_confirm.h b/src/fdr/fdr_confirm.h index 9490df43..a23082cc 100644 --- a/src/fdr/fdr_confirm.h +++ b/src/fdr/fdr_confirm.h @@ -62,7 +62,6 @@ struct LitInfo { u8 size; u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above. u8 next; - u8 pure; //!< The pass-on of pure flag from hwlmLiteral. 
}; #define FDRC_FLAG_NO_CONFIRM 1 diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 3eab21b2..8e369089 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -87,7 +87,6 @@ void fillLitInfo(const vector &lits, vector &tmpLitInfo, info.flags = flags; info.size = verify_u8(max(lit.msk.size(), lit.s.size())); info.groups = lit.groups; - info.pure = lit.pure; // these are built up assuming a LE machine CONF_TYPE msk = all_ones; diff --git a/src/fdr/fdr_confirm_runtime.h b/src/fdr/fdr_confirm_runtime.h index 67e0d692..5a216495 100644 --- a/src/fdr/fdr_confirm_runtime.h +++ b/src/fdr/fdr_confirm_runtime.h @@ -65,7 +65,6 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a u8 oldNext; // initialized in loop do { assert(ISALIGNED(li)); - scratch->pure = li->pure; if (unlikely((conf_key & li->msk) != li->v)) { goto out; @@ -100,7 +99,6 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a li++; } while (oldNext); scratch->fdr_conf = NULL; - scratch->pure = 0; } #endif diff --git a/src/hs.cpp b/src/hs.cpp index 329702d4..ab54105c 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -251,7 +251,7 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, ng.rm.logicalKeyRenumber(); unsigned length = 0; - struct hs_database *out = build(ng, &length); + struct hs_database *out = build(ng, &length, 0); assert(out); // should have thrown exception on error assert(length); @@ -281,6 +281,130 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, } } +hs_error_t +hs_compile_lit_multi_int(const char *const *expressions, const unsigned *flags, + const unsigned *ids, 
const hs_expr_ext *const *ext, + const size_t *lens, unsigned elements, unsigned mode, + const hs_platform_info_t *platform, hs_database_t **db, + hs_compile_error_t **comp_error, const Grey &g) { + // Check the args: note that it's OK for flags, ids or ext to be null. + if (!comp_error) { + if (db) { + *db = nullptr; + } + // nowhere to write the string, but we can still report an error code + return HS_COMPILER_ERROR; + } + if (!db) { + *comp_error = generateCompileError("Invalid parameter: db is NULL", -1); + return HS_COMPILER_ERROR; + } + if (!expressions) { + *db = nullptr; + *comp_error + = generateCompileError("Invalid parameter: expressions is NULL", + -1); + return HS_COMPILER_ERROR; + } + if (!lens) { + *db = nullptr; + *comp_error = generateCompileError("Invalid parameter: len is NULL", -1); + return HS_COMPILER_ERROR; + } + if (elements == 0) { + *db = nullptr; + *comp_error = generateCompileError("Invalid parameter: elements is zero", -1); + return HS_COMPILER_ERROR; + } + +#if defined(FAT_RUNTIME) + if (!check_ssse3()) { + *db = nullptr; + *comp_error = generateCompileError("Unsupported architecture", -1); + return HS_ARCH_ERROR; + } +#endif + + if (!checkMode(mode, comp_error)) { + *db = nullptr; + assert(*comp_error); // set by checkMode. + return HS_COMPILER_ERROR; + } + + if (!checkPlatform(platform, comp_error)) { + *db = nullptr; + assert(*comp_error); // set by checkPlattform. + return HS_COMPILER_ERROR; + } + + if (elements > g.limitPatternCount) { + *db = nullptr; + *comp_error = generateCompileError("Number of patterns too large", -1); + return HS_COMPILER_ERROR; + } + + // This function is simply a wrapper around both the parser and compiler + bool isStreaming = mode & (HS_MODE_STREAM | HS_MODE_VECTORED); + bool isVectored = mode & HS_MODE_VECTORED; + unsigned somPrecision = getSomPrecision(mode); + + target_t target_info = platform ? 
target_t(*platform) + : get_current_target(); + + try { + CompileContext cc(isStreaming, isVectored, target_info, g); + NG ng(cc, elements, somPrecision); + + for (unsigned int i = 0; i < elements; i++) { + // Add this expression to the compiler + try { + addLitExpression(ng, i, expressions[i], flags ? flags[i] : 0, + ext ? ext[i] : nullptr, ids ? ids[i] : 0, + lens[i]); + } catch (CompileError &e) { + /* Caught a parse error; + * throw it upstream as a CompileError with a specific index */ + e.setExpressionIndex(i); + throw; /* do not slice */ + } + } + + // Check sub-expression ids + ng.rm.pl.validateSubIDs(ids, expressions, flags, elements); + // Renumber and assign lkey to reports + ng.rm.logicalKeyRenumber(); + + unsigned length = 0; + struct hs_database *out = build(ng, &length, 1); + + assert(out); //should have thrown exception on error + assert(length); + + *db = out; + *comp_error = nullptr; + + return HS_SUCCESS; + } + catch (const CompileError &e) { + // Compiler error occurred + *db = nullptr; + *comp_error = generateCompileError(e.reason, + e.hasIndex ? (int)e.index : -1); + return HS_COMPILER_ERROR; + } + catch (const std::bad_alloc &) { + *db = nullptr; + *comp_error = const_cast(&hs_enomem); + return HS_COMPILER_ERROR; + } + catch (...) 
{
+        assert(!"Internal errror, unexpected exception");
+        *db = nullptr;
+        *comp_error = const_cast(&hs_einternal);
+        return HS_COMPILER_ERROR;
+    }
+}
+
 } // namespace ue2
 
 extern "C" HS_PUBLIC_API
@@ -326,6 +450,49 @@ hs_error_t HS_CDECL hs_compile_ext_multi(const char * const *expressions,
                                 platform, db, error, Grey());
 }
 
+// Single-literal entry point: validates the expression pointer, then defers
+// all remaining argument checking to hs_compile_lit_multi_int().
+extern "C" HS_PUBLIC_API
+hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags,
+                                   const size_t len, unsigned mode,
+                                   const hs_platform_info_t *platform,
+                                   hs_database_t **db,
+                                   hs_compile_error_t **error) {
+    if (expression == nullptr) {
+        // db and error are caller-supplied and may themselves be NULL; only
+        // write through the ones that are usable.
+        if (db) {
+            *db = nullptr;
+        }
+        if (error) {
+            *error = generateCompileError(
+                "Invalid parameter: expression is NULL", -1);
+        }
+        return HS_COMPILER_ERROR;
+    }
+
+    unsigned id = 0; // single expressions get zero as an ID
+    const hs_expr_ext * const *ext = nullptr; // unused for this call.
+
+    return hs_compile_lit_multi_int(&expression, &flags, &id, ext, &len, 1,
+                                    mode, platform, db, error, Grey());
+}
+
+extern "C" HS_PUBLIC_API
+hs_error_t HS_CDECL hs_compile_lit_multi(const char * const *expressions,
+                                         const unsigned *flags,
+                                         const unsigned *ids,
+                                         const size_t *lens,
+                                         unsigned elements, unsigned mode,
+                                         const hs_platform_info_t *platform,
+                                         hs_database_t **db,
+                                         hs_compile_error_t **error) {
+    const hs_expr_ext * const *ext = nullptr; // unused for this call.
+ return hs_compile_lit_multi_int(expressions, flags, ids, ext, lens, + elements, mode, platform, db, error, + Grey()); +} + static hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, const hs_expr_ext_t *ext, unsigned int mode, diff --git a/src/hs_compile.h b/src/hs_compile.h index c8dcfdf2..4c372ffe 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -323,6 +323,10 @@ typedef struct hs_expr_ext { * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param mode * Compiler mode flags that affect the database as a whole. One of @ref @@ -392,6 +396,10 @@ hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param ids * An array of integers specifying the ID number to be associated with the @@ -472,6 +480,10 @@ hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. 
+ * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param ids * An array of integers specifying the ID number to be associated with the @@ -527,6 +539,165 @@ hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, const hs_platform_info_t *platform, hs_database_t **db, hs_compile_error_t **error); +/** + * The basic pure literal expression compiler. + * + * This is the function call with which a pure literal expression (not a + * common regular expression) is compiled into a Hyperscan database which + * can be passed to the runtime functions (such as @ref hs_scan(), + * @ref hs_open_stream(), etc.) + * + * @param expression + * The expression to parse, @p len bytes long. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. Meanwhile, the string content shall be fully parsed in a literal + * sense without any regular grammars. For example, the @p expression + * `abc?` simply means a char sequence of `a`, `b`, `c`, and `?`. The `?` + * here doesn't mean 0 or 1 quantifier under regular semantics. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Compared to @ref hs_compile(), fewer + * valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. 
+ * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * + * @param len + * The length of the text content of the pure literal expression. As the + * text content indicated by @p expression is treated as single character + * one by one, the special terminating character `\0` should be allowed + * to appear in expression, and not treated as a terminator for a string. + * Thus, the end of a pure literal expression cannot be indicated by + * identifying `\0`, but by counting to the expression length. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. 
+ */ +hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags, + const size_t len, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); +/** + * The multiple pure literal expression compiler. + * + * This is the function call with which a set of pure literal expressions is + * compiled into a database which can be passed to the runtime functions (such + * as @ref hs_scan(), @ref hs_open_stream(), etc.) Each expression can be + * labelled with a unique integer which is passed into the match callback to + * identify the pattern that has matched. + * + * @param expressions + * Array of literal expressions to parse. Note that each string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. Meanwhile, the string content shall be fully parsed in a literal + * sense without any regular grammars. For example, the @p expression + * `abc?` simply means a char sequence of `a`, `b`, `c`, and `?`. The `?` + * here doesn't mean 0 or 1 quantifier under regular semantics. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Compared to @ref hs_compile_multi(), fewer valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. 
+ * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param lens + * Array of lengths of the text content of each pure literal expression. + * As the text content indicated by @p expression is treated as single + * character one by one, the special terminating character `\0` should be + * allowed to appear in expression, and not treated as a terminator for a + * string. Thus, the end of a pure literal expression cannot be indicated + * by identifying `\0`, but by counting to the expression length. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. 
+ * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_compile_lit_multi(const char * const *expressions, + const unsigned *flags, + const unsigned *ids, + const size_t *lens, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); + /** * Free an error structure generated by @ref hs_compile(), @ref * hs_compile_multi() or @ref hs_compile_ext_multi(). @@ -579,6 +750,10 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param info * On success, a pointer to the pattern information will be returned in @@ -641,6 +816,10 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. 
* * @param ext * A pointer to a filled @ref hs_expr_ext_t structure that defines diff --git a/src/hs_internal.h b/src/hs_internal.h index 2a00fa2f..adf07b22 100644 --- a/src/hs_internal.h +++ b/src/hs_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,6 +52,17 @@ hs_error_t hs_compile_multi_int(const char *const *expressions, hs_database_t **db, hs_compile_error_t **comp_error, const Grey &g); +/** \brief Internal use only: takes a Grey argument so that we can use it in + * tools. */ +hs_error_t hs_compile_lit_multi_int(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + const hs_expr_ext *const *ext, + const size_t *lens, unsigned elements, + unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **comp_error, + const Grey &g); } // namespace ue2 extern "C" diff --git a/src/hwlm/hwlm_literal.cpp b/src/hwlm/hwlm_literal.cpp index b257dfb0..692f7c6c 100644 --- a/src/hwlm/hwlm_literal.cpp +++ b/src/hwlm/hwlm_literal.cpp @@ -83,10 +83,9 @@ bool maskIsConsistent(const std::string &s, bool nocase, const vector &msk, * \ref HWLM_MASKLEN. 
*/ hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in, u32 id_in, hwlm_group_t groups_in, - const vector &msk_in, const vector &cmp_in, - bool pure_in) + const vector &msk_in, const vector &cmp_in) : s(s_in), id(id_in), nocase(nocase_in), noruns(noruns_in), - groups(groups_in), msk(msk_in), cmp(cmp_in), pure(pure_in) { + groups(groups_in), msk(msk_in), cmp(cmp_in) { assert(s.size() <= HWLM_LITERAL_MAX_LEN); assert(msk.size() <= HWLM_MASKLEN); assert(msk.size() == cmp.size()); diff --git a/src/hwlm/hwlm_literal.h b/src/hwlm/hwlm_literal.h index 72a57f94..598de814 100644 --- a/src/hwlm/hwlm_literal.h +++ b/src/hwlm/hwlm_literal.h @@ -113,16 +113,13 @@ struct hwlmLiteral { */ std::vector cmp; - bool pure; //!< \brief The pass-on of pure flag from LitFragment. - /** \brief Complete constructor, takes group information and msk/cmp. * * This constructor takes a msk/cmp pair. Both must be vectors of length <= * \ref HWLM_MASKLEN. */ hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in, u32 id_in, hwlm_group_t groups_in, - const std::vector &msk_in, const std::vector &cmp_in, - bool pure_in = false); + const std::vector &msk_in, const std::vector &cmp_in); /** \brief Simple constructor: no group information, no msk/cmp. 
* diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index d08bab3c..a5d67f30 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -185,7 +185,6 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) { return false; } - vis.lit.set_pure(); const ue2_literal &lit = vis.lit; if (lit.empty()) { diff --git a/src/rose/block.c b/src/rose/block.c index a32113f4..b3f424cb 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/rose/match.c b/src/rose/match.c index c91b2a50..84d3b1fd 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -238,10 +238,10 @@ hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, assert(id && id < t->size); // id is an offset into bytecode const u64a som = 0; const u8 flags = 0; - if (!scratch->pure) { - return roseRunProgram(t, scratch, id, som, end, flags); - } else { + if (t->pureLiteral) { return roseRunProgram_l(t, scratch, id, som, end, flags); + } else { + return roseRunProgram(t, scratch, id, som, end, flags); } } @@ -619,8 +619,12 @@ int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { // Our match ID is the program offset. 
const u32 program = id; const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; - hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, start, end, flags); + hwlmcb_rv_t rv; + if (rose->pureLiteral) { + rv = roseRunProgram_l(rose, scratch, program, start, end, flags); + } else { + rv = roseRunProgram(rose, scratch, program, start, end, flags); + } if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 4238f2e4..0f2d1083 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -2884,6 +2884,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, assert(programOffset >= sizeof(struct RoseEngine)); assert(programOffset < t->size); + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; const char *pc_base = getByOffset(t, programOffset); @@ -2911,6 +2912,56 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK_32) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask32(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_BYTE) { + const struct 
core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CATCH_UP) { if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { return HWLM_TERMINATE_MATCHING; @@ -2967,6 +3018,17 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. + if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(REPORT) { updateSeqPoint(tctxt, end, from_mpv); if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, @@ -3117,6 +3179,24 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(INCLUDED_JUMP) { + if (scratch->fdr_conf) { + // squash the bucket of included literal + u8 shift = scratch->fdr_conf_offset & ~7U; + u64a mask = ((~(u64a)ri->squash) << shift); + *(scratch->fdr_conf) &= mask; + + pc = getByOffset(t, ri->child_offset); + pc_base = pc; + programOffset = (const u8 *)pc_base -(const u8 *)t; + DEBUG_PRINTF("pc_base %p pc %p child_offset %u squash %u\n", + pc_base, pc, ri->child_offset, ri->squash); + work_done = 0; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(SET_LOGICAL) { DEBUG_PRINTF("set logical value of lkey %u, offset_adjust=%d\n", ri->lkey, ri->offset_adjust); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 908d13c1..5cbb5c84 100644 
--- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -2843,34 +2843,9 @@ vector groupByFragment(const RoseBuildImpl &build) { DEBUG_PRINTF("fragment candidate: lit_id=%u %s\n", lit_id, dumpString(lit.s).c_str()); - - /** 0:/xxabcdefgh/ */ - /** 1:/yyabcdefgh/ */ - /** 2:/yyabcdefgh.+/ */ - // Above 3 patterns should firstly convert into RoseLiteralMap with - // 2 elements ("xxabcdefgh" and "yyabcdefgh"), then convert into - // LitFragment with 1 element ("abcdefgh"). Special care should be - // taken to handle the 'pure' flag during the conversion. - - rose_literal_id lit_frag = getFragment(lit); - auto it = frag_info.find(lit_frag); - if (it != frag_info.end()) { - if (!lit_frag.s.get_pure() && it->first.s.get_pure()) { - struct FragmentInfo f_info = it->second; - f_info.lit_ids.push_back(lit_id); - f_info.groups |= groups; - frag_info.erase(it->first); - frag_info.emplace(lit_frag, f_info); - } else { - it->second.lit_ids.push_back(lit_id); - it->second.groups |= groups; - } - } else { - struct FragmentInfo f_info; - f_info.lit_ids.push_back(lit_id); - f_info.groups |= groups; - frag_info.emplace(lit_frag, f_info); - } + auto &fi = frag_info[getFragment(lit)]; + fi.lit_ids.push_back(lit_id); + fi.groups |= groups; } for (auto &m : frag_info) { diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index fe48da4c..7780848b 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -340,14 +340,7 @@ public: std::pair insert(const rose_literal_id &lit) { auto it = lits_index.find(lit); if (it != lits_index.end()) { - u32 idx = it->second; - auto &l = lits.at(idx); - if (!lit.s.get_pure() && l.s.get_pure()) { - lits_index.erase(l); - l.s.unset_pure(); - lits_index.emplace(l, idx); - } - return {idx, false}; + return {it->second, false}; } u32 id = verify_u32(lits.size()); lits.push_back(lit); diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 8c532cab..4fde4c44 100644 --- 
a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -727,7 +727,6 @@ void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp, const auto &s_final = lit_final.get_string(); bool nocase = lit_final.any_nocase(); - bool pure = f.s.get_pure(); DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n", f.fragment_id, escapeString(s_final).c_str(), (int)nocase, @@ -741,7 +740,7 @@ void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp, const auto &groups = f.groups; mp.lits.emplace_back(move(s_final), nocase, noruns, f.fragment_id, - groups, msk, cmp, pure); + groups, msk, cmp); } static diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index ff24a9cc..7bd6779c 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -328,6 +328,7 @@ struct RoseBoundaryReports { * nfas). Rose nfa info table can distinguish the cases. */ struct RoseEngine { + u8 pureLiteral; /* Indicator of pure literal API */ u8 noFloatingRoots; /* only need to run the anchored table if something * matched in the anchored table */ u8 requiresEodCheck; /* stuff happens at eod time */ diff --git a/src/runtime.c b/src/runtime.c index ed1eaf53..a3659348 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -141,7 +141,6 @@ void populateCoreInfo(struct hs_scratch *s, const struct RoseEngine *rose, s->deduper.current_report_offset = ~0ULL; s->deduper.som_log_dirty = 1; /* som logs have not been cleared */ s->fdr_conf = NULL; - s->pure = 0; // Rose program execution (used for some report paths) depends on these // values being initialised. 
diff --git a/src/scratch.c b/src/scratch.c index c23b5b3c..b4630640 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -137,7 +137,6 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { s->scratchSize = alloc_size; s->scratch_alloc = (char *)s_tmp; s->fdr_conf = NULL; - s->pure = 0; // each of these is at an offset from the previous char *current = (char *)s + sizeof(*s); diff --git a/src/scratch.h b/src/scratch.h index e2e8039a..1256f7ab 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -211,7 +211,6 @@ struct ALIGN_CL_DIRECTIVE hs_scratch { u64a *fdr_conf; /**< FDR confirm value */ u8 fdr_conf_offset; /**< offset where FDR/Teddy front end matches * in buffer */ - u8 pure; /**< indicator of pure-literal or cutting-literal */ }; /* array of fatbit ptr; TODO: why not an array of fatbits? */ diff --git a/src/util/ue2string.cpp b/src/util/ue2string.cpp index 98b007d4..50b2bbcc 100644 --- a/src/util/ue2string.cpp +++ b/src/util/ue2string.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -283,7 +283,6 @@ ue2_literal &ue2_literal::erase(size_type pos, size_type n) { } void ue2_literal::push_back(char c, bool nc) { - assert(!nc || ourisalpha(c)); if (nc) { c = mytoupper(c); } diff --git a/src/util/ue2string.h b/src/util/ue2string.h index 1ce51b2f..0aa84689 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -211,17 +211,10 @@ public: size_t hash() const; - void set_pure() { pure = true; } - void unset_pure() { pure = false; } - bool get_pure() const { return pure; } - - /* TODO: consider existing member functions possibly related with pure. */ - private: friend const_iterator; std::string s; boost::dynamic_bitset<> nocase; - bool pure = false; /**< born from cutting or not (pure literal). 
*/ }; /// Return a reversed copy of this literal. diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h index 820cad7c..7c2c8f9d 100644 --- a/tools/hsbench/common.h +++ b/tools/hsbench/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,7 @@ extern unsigned int somPrecisionMode; extern bool forceEditDistance; extern unsigned editDistance; extern bool printCompressSize; +extern bool useLiteralApi; /** Structure for the result of a single complete scan. */ struct ResultEntry { diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp index 3390c263..c1f1e8c4 100644 --- a/tools/hsbench/engine_hyperscan.cpp +++ b/tools/hsbench/engine_hyperscan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -411,22 +411,30 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, ext_ptr[i] = &ext[i]; } - Timer timer; - timer.start(); - hs_compile_error_t *compile_err; + Timer timer; -#ifndef RELEASE_BUILD - err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(), - ext_ptr.data(), count, full_mode, nullptr, - &db, &compile_err, grey); -#else - err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.data(), - ext_ptr.data(), count, full_mode, nullptr, - &db, &compile_err); -#endif + if (useLiteralApi) { + // Pattern length computation should be done before timer start. 
+ vector lens(count); + for (unsigned int i = 0; i < count; i++) { + lens[i] = strlen(patterns[i]); + } + timer.start(); + err = hs_compile_lit_multi_int(patterns.data(), flags.data(), + ids.data(), ext_ptr.data(), + lens.data(), count, full_mode, + nullptr, &db, &compile_err, grey); + timer.complete(); + } else { + timer.start(); + err = hs_compile_multi_int(patterns.data(), flags.data(), + ids.data(), ext_ptr.data(), count, + full_mode, nullptr, &db, &compile_err, + grey); + timer.complete(); + } - timer.complete(); compileSecs = timer.seconds(); peakMemorySize = getPeakHeap(); diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index de9fde07..8e85d7ae 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -87,6 +87,7 @@ unsigned int somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE; bool forceEditDistance = false; unsigned editDistance = 0; bool printCompressSize = false; +bool useLiteralApi = false; // Globals local to this file. 
static bool compressStream = false; @@ -218,6 +219,7 @@ void usage(const char *error) { printf(" --per-scan Display per-scan Mbit/sec results.\n"); printf(" --echo-matches Display all matches that occur during scan.\n"); printf(" --sql-out FILE Output sqlite db.\n"); + printf(" --literal-on Use Hyperscan pure literal matching.\n"); printf(" -S NAME Signature set name (for sqlite db).\n"); printf("\n\n"); @@ -250,6 +252,7 @@ void processArgs(int argc, char *argv[], vector &sigSets, int do_echo_matches = 0; int do_sql_output = 0; int option_index = 0; + int literalFlag = 0; vector sigFiles; static struct option longopts[] = { @@ -257,6 +260,7 @@ void processArgs(int argc, char *argv[], vector &sigSets, {"echo-matches", no_argument, &do_echo_matches, 1}, {"compress-stream", no_argument, &do_compress, 1}, {"sql-out", required_argument, &do_sql_output, 1}, + {"literal-on", no_argument, &literalFlag, 1}, {nullptr, 0, nullptr, 0} }; @@ -463,6 +467,8 @@ void processArgs(int argc, char *argv[], vector &sigSets, loadSignatureList(file, sigs); sigSets.emplace_back(file, move(sigs)); } + + useLiteralApi = (bool)literalFlag; } /** Start the global timer. 
*/ diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp index 595c8b84..9cfe73df 100644 --- a/tools/hscheck/main.cpp +++ b/tools/hscheck/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -92,6 +92,7 @@ bool g_allSignatures = false; bool g_forceEditDistance = false; bool build_sigs = false; bool check_logical = false; +bool use_literal_api = false; unsigned int g_signature; unsigned int g_editDistance; unsigned int globalFlags = 0; @@ -322,11 +323,26 @@ void checkExpression(UNUSED void *threadarg) { #if !defined(RELEASE_BUILD) // This variant is available in non-release builds and allows us to // modify greybox settings. - err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, mode, - nullptr, &db, &compile_err, *g_grey); + if (use_literal_api) { + size_t len = strlen(regexp); + err = hs_compile_lit_multi_int(&regexp, &flags, nullptr, &extp, + &len, 1, mode, nullptr, &db, + &compile_err, *g_grey); + } else { + err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, + mode, nullptr, &db, &compile_err, + *g_grey); + } #else - err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, mode, - nullptr, &db, &compile_err); + if (use_literal_api) { + size_t len = strlen(regexp); + err = hs_compile_lit_multi_int(&regexp, &flags, nullptr, &extp, + &len, 1, mode, nullptr, &db, + &compile_err, *g_grey); + } else { + err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, + mode, nullptr, &db, &compile_err); + } #endif if (err == HS_SUCCESS) { @@ -381,6 +397,11 @@ void checkLogicalExpression(UNUSED void *threadarg) { ExprExtMap::const_iterator it; while (getNextLogicalExpression(it)) { + if (use_literal_api) { + recordSuccess(g_exprMap, it->first); + continue; + } + const ParsedExpr &comb = it->second; vector subIds; @@ -470,6 +491,7 @@ void
usage() { << " -h Display this help." << endl << " -B Build signature set." << endl << " -C Check logical combinations (default: off)." << endl + << " --literal-on Processing pure literals, no need to check." << endl << endl; } @@ -477,9 +499,15 @@ static void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { const char options[] = "e:E:s:z:hHLNV8G:T:BC"; bool signatureSet = false; + int literalFlag = 0; + + static struct option longopts[] = { + {"literal-on", no_argument, &literalFlag, 1}, + {nullptr, 0, nullptr, 0} + }; for (;;) { - int c = getopt_long(argc, argv, options, nullptr, nullptr); + int c = getopt_long(argc, argv, options, longopts, nullptr); if (c < 0) { break; } @@ -539,6 +567,9 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { case 'C': check_logical = true; break; + case 0: + case 1: + break; default: usage(); exit(1); @@ -564,6 +595,8 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { usage(); exit(1); } + + use_literal_api = (bool)literalFlag; } static diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index 5a4bdc00..f30a8f5e 100644 --- a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -43,6 +43,7 @@ #include "parser/Parser.h" #include "parser/parse_error.h" #include "util/make_unique.h" +#include "util/string_util.h" #include "util/unicode_def.h" #include "util/unordered.h" @@ -111,6 +112,15 @@ bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander, return false; } + if (use_literal_api) { + // filter out flags not supported by pure literal API. 
+ u32 not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 | + HS_FLAG_UCP | HS_FLAG_PREFILTER; + hs_flags &= ~not_supported; + force_utf8 = false; + force_prefilter = false; + } + expr.swap(regex); if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som, @@ -260,9 +270,29 @@ GroundTruth::compile(unsigned id, bool no_callouts) { throw PcreCompileFailure("Unable to decode flags."); } + // When hyperscan literal api is on, transfer the regex string into hex. + if (use_literal_api && !combination) { + unsigned char *pat + = reinterpret_cast(const_cast(re.c_str())); + char *str = makeHex(pat, re.length()); + if (!str) { + throw PcreCompileFailure("makeHex() malloc failure."); + } + re.assign(str); + free(str); + } + // filter out flags not supported by PCRE u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET | HS_EXT_FLAG_MIN_LENGTH; + if (use_literal_api) { + ext.flags &= 0ULL; + ext.min_offset = 0; + ext.max_offset = MAX_OFFSET; + ext.min_length = 0; + ext.edit_distance = 0; + ext.hamming_distance = 0; + } if (ext.flags & ~supported) { // edit distance is a known unsupported flag, so just throw a soft error if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) { @@ -314,7 +344,6 @@ GroundTruth::compile(unsigned id, bool no_callouts) { return compiled; } - compiled->bytecode = pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr); diff --git a/tools/hscollider/NfaGeneratedCorpora.cpp b/tools/hscollider/NfaGeneratedCorpora.cpp index b7c77ee1..66ae270b 100644 --- a/tools/hscollider/NfaGeneratedCorpora.cpp +++ b/tools/hscollider/NfaGeneratedCorpora.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,6 +32,7 @@ #include "ng_corpus_generator.h" #include "NfaGeneratedCorpora.h" #include "ExpressionParser.h" +#include 
"common.h" #include "grey.h" #include "hs_compile.h" @@ -44,6 +45,7 @@ #include "util/compile_context.h" #include "util/compile_error.h" #include "util/report_manager.h" +#include "util/string_util.h" #include "util/target_info.h" #include @@ -80,6 +82,18 @@ void NfaGeneratedCorpora::generate(unsigned id, vector &data) { throw CorpusFailure("Expression could not be read: " + i->second); } + // When hyperscan literal api is on, transfer the regex string into hex. + if (use_literal_api && !(hs_flags & HS_FLAG_COMBINATION)) { + unsigned char *pat + = reinterpret_cast(const_cast(re.c_str())); + char *str = makeHex(pat, re.length()); + if (!str) { + throw CorpusFailure("makeHex() malloc failure."); + } + re.assign(str); + free(str); + } + // Combination's corpus is consist of sub-expressions' corpuses. if (hs_flags & HS_FLAG_COMBINATION) { ParsedLogical pl; diff --git a/tools/hscollider/UltimateTruth.cpp b/tools/hscollider/UltimateTruth.cpp index c37e39ba..038fbf77 100644 --- a/tools/hscollider/UltimateTruth.cpp +++ b/tools/hscollider/UltimateTruth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -925,11 +925,22 @@ compileHyperscan(vector &patterns, vector &flags, const unsigned count = patterns.size(); hs_database_t *db = nullptr; hs_compile_error_t *compile_err; + hs_error_t err; - hs_error_t err = hs_compile_multi_int(&patterns[0], &flags[0], - &idsvec[0], ext.c_array(), count, - mode, platform, &db, - &compile_err, grey); + if (use_literal_api) { + // Compute length of each pattern. 
+ vector<size_t> lens(count); + for (unsigned int i = 0; i < count; i++) { + lens[i] = strlen(patterns[i]); + } + err = hs_compile_lit_multi_int(&patterns[0], &flags[0], &idsvec[0], + ext.c_array(), &lens[0], count, mode, + platform, &db, &compile_err, grey); + } else { + err = hs_compile_multi_int(&patterns[0], &flags[0], &idsvec[0], + ext.c_array(), count, mode, platform, &db, + &compile_err, grey); + } if (err != HS_SUCCESS) { error = compile_err->message; diff --git a/tools/hscollider/args.cpp b/tools/hscollider/args.cpp index 3b515027..2eb510e0 100644 --- a/tools/hscollider/args.cpp +++ b/tools/hscollider/args.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -116,6 +116,7 @@ void usage(const char *name, const char *error) { printf(" --abort-on-fail Abort, rather than exit, on failure.\n"); printf(" --no-signal-handler Do not handle handle signals (to generate " "backtraces).\n"); + printf(" --literal-on Use Hyperscan pure literal matching.\n"); printf("\n"); printf("Memory and resource control options:\n"); printf("\n"); @@ -174,6 +175,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, int mangleScratch = 0; int compressFlag = 0; int compressResetFlag = 0; + int literalFlag = 0; static const struct option longopts[] = { {"copy-scratch", 0, &copyScratch, 1}, {"copy-stream", 0, &copyStream, 1}, @@ -187,6 +189,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, {"compress-expand", 0, &compressFlag, 1}, {"compress-reset-expand", 0, &compressResetFlag, 1}, {"no-groups", 0, &no_groups, 1}, + {"literal-on", 0, &literalFlag, 1}, {nullptr, 0, nullptr, 0}}; for (;;) { @@ -589,4 +592,5 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, use_mangle_scratch = (bool) mangleScratch;
use_compress_expand = (bool)compressFlag; use_compress_reset_expand = (bool)compressResetFlag; + use_literal_api = (bool)literalFlag; } diff --git a/tools/hscollider/common.h b/tools/hscollider/common.h index d9a0144c..67e488c0 100644 --- a/tools/hscollider/common.h +++ b/tools/hscollider/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -82,6 +82,7 @@ extern bool use_copy_stream; extern bool use_mangle_scratch; extern bool use_compress_expand; extern bool use_compress_reset_expand; +extern bool use_literal_api; extern int abort_on_failure; extern int no_signal_handler; extern bool force_edit_distance; diff --git a/tools/hscollider/main.cpp b/tools/hscollider/main.cpp index 18d7a016..afa6ef5a 100644 --- a/tools/hscollider/main.cpp +++ b/tools/hscollider/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -118,6 +118,7 @@ bool use_copy_stream = false; bool use_mangle_scratch = false; bool use_compress_expand = false; bool use_compress_reset_expand = false; +bool use_literal_api = false; int abort_on_failure = 0; int no_signal_handler = 0; size_t max_scan_queue_len = 25000; diff --git a/tools/hsdump/main.cpp b/tools/hsdump/main.cpp index 3221d1b6..75db1c4f 100644 --- a/tools/hsdump/main.cpp +++ b/tools/hsdump/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -106,6 +106,8 @@ bool dump_intermediate = true; bool 
force_edit_distance = false; u32 edit_distance = 0; +int use_literal_api = 0; + } // namespace // Usage statement. @@ -139,6 +141,7 @@ void usage(const char *name, const char *error) { printf(" -8 Force UTF8 mode on all patterns.\n"); printf(" -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n"); printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n"); + printf(" --literal-on Use Hyperscan pure literal matching API.\n"); printf("\n"); printf("Example:\n"); printf("$ %s -e pattern.file -s sigfile\n", name); @@ -163,6 +166,7 @@ void processArgs(int argc, char *argv[], Grey &grey) { {"utf8", no_argument, nullptr, '8'}, {"prefilter", no_argument, &force_prefilter, 1}, {"som-width", required_argument, nullptr, 'd'}, + {"literal-on", no_argument, &use_literal_api, 1}, {nullptr, 0, nullptr, 0} }; @@ -501,9 +505,23 @@ unsigned int dumpDataMulti(const vector &patterns, hs_database_t *db = nullptr; hs_compile_error_t *compile_err; - hs_error_t err = hs_compile_multi_int( - patterns.data(), flags.data(), ids.data(), ext.c_array(), - patterns.size(), mode, plat_info.get(), &db, &compile_err, grey); + hs_error_t err; + const size_t count = patterns.size(); + if (use_literal_api) { + // Compute length of each pattern. 
+ vector lens(count); + for (unsigned int i = 0; i < count; i++) { + lens[i] = strlen(patterns[i]); + } + err = hs_compile_lit_multi_int(patterns.data(), flags.data(), + ids.data(), ext.c_array(), lens.data(), + count, mode, plat_info.get(), &db, + &compile_err, grey); + } else { + err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(), + ext.c_array(), count, mode, plat_info.get(), + &db, &compile_err, grey); + } if (err != HS_SUCCESS) { if (compile_err && compile_err->message) { diff --git a/util/string_util.h b/util/string_util.h index 658eb704..b44586ea 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -127,4 +127,18 @@ void prettyPrintRange(std::ostream &out, it_t begin, it_t end) { } } +// Transfer given string into a hex-escaped pattern. +static really_inline +char *makeHex(const unsigned char *pat, unsigned patlen) { + size_t hexlen = patlen * 4; + char *hexbuf = (char *)malloc(hexlen + 1); + unsigned i; + char *buf; + for (i = 0, buf = hexbuf; i < patlen; i++, buf += 4) { + snprintf(buf, 5, "\\x%02x", (unsigned char)pat[i]); + } + hexbuf[hexlen] = '\0'; + return hexbuf; +} + #endif // STRING_UTIL_H