From 356c0ab3d4a47010d5decf4242b9d1b72f73bbbd Mon Sep 17 00:00:00 2001 From: Derrick Lyndon Pallas Date: Fri, 19 Apr 2019 21:36:19 +0000 Subject: [PATCH 01/18] dispatcher: return correct function type from ifunc resolver --- src/dispatcher.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dispatcher.c b/src/dispatcher.c index 70b82277..a786b806 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -51,7 +51,7 @@ } \ \ /* resolver */ \ - static void(*JOIN(resolve_, NAME)(void)) { \ + static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \ if (check_avx512()) { \ return JOIN(avx512_, NAME); \ } \ From e15954a4bd02641b80f3e5cb9c71d8d80effd001 Mon Sep 17 00:00:00 2001 From: Derrick Lyndon Pallas Date: Mon, 22 Apr 2019 20:19:14 +0000 Subject: [PATCH 02/18] Avoid array-bounds error when debug/fortify enabled This code causes GCC to error out due to a bounds error with the following set -D_GLIBCXX_DEBUG -D_FORTIFY_SOURCE=2 The solution is to copy via iterator. --- util/expression_path.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/util/expression_path.h b/util/expression_path.h index 3075b4d4..ac4ca97d 100644 --- a/util/expression_path.h +++ b/util/expression_path.h @@ -56,9 +56,8 @@ std::string inferExpressionPath(const std::string &sigFile) { // POSIX variant. // dirname() may modify its argument, so we must make a copy. - std::vector path(sigFile.size() + 1); - memcpy(path.data(), sigFile.c_str(), sigFile.size()); - path[sigFile.size()] = 0; // ensure null termination. + std::vector path(sigFile.begin(), sigFile.end()); + path.push_back(0); // ensure null termination. std::string rv = dirname(path.data()); #else From 5a1b02bc10547dbb8f0e5c0fccb7373fa55d0979 Mon Sep 17 00:00:00 2001 From: Derrick Lyndon Pallas Date: Mon, 22 Apr 2019 21:13:52 +0000 Subject: [PATCH 03/18] Fix uninitialized use of scatter_unit_uX due to padding These non-packed structures are placed into a std::vector. 
Later, the contents of the vector are memcpy'd and the CRC of this space is taken. Some compilers will zero the struct padding but GCC8.2 with -O2 at least will not. This means that the CRC is based on uninitialized memory. Since it is expected that these bytes will be memcpy'd, zero in place once they're in the std::vector. Found by Valgrind. Q.v. Issue #148 --- src/util/multibit_build.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/multibit_build.cpp b/src/util/multibit_build.cpp index ad6a0d6a..67bb9ec7 100644 --- a/src/util/multibit_build.cpp +++ b/src/util/multibit_build.cpp @@ -192,11 +192,11 @@ vector mmbBuildSparseIterator(const vector &bits, template static void add_scatter(vector *out, u32 offset, u64a mask) { - T su; + out->emplace_back(); + T &su = out->back(); memset(&su, 0, sizeof(su)); su.offset = offset; su.val = mask; - out->push_back(su); DEBUG_PRINTF("add %llu at offset %u\n", mask, offset); } From 92edc37c1f1cc509ce65cfc50bb41014aa0ba044 Mon Sep 17 00:00:00 2001 From: Mostafa Nazari Date: Wed, 8 May 2019 09:42:48 +0430 Subject: [PATCH 04/18] BUGFIX: fix Numerical result out of range error Fix Error errno=34, fix Numerical result out of range error issue: https://github.com/intel/hyperscan/issues/155 --- src/fdr/fdr_compile.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 65c5020e..39cbc335 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -282,8 +282,8 @@ const array Scorer::count_lut{{ }}; const array Scorer::len_lut{{ - pow(0, -3.0), pow(1, -3.0), pow(2, -3.0), pow(3, -3.0), pow(4, -3.0), - pow(5, -3.0), pow(6, -3.0), pow(7, -3.0), pow(8, -3.0)}}; + 0, pow(1, -3.0), pow(2, -3.0), pow(3, -3.0), pow(4, -3.0), + pow(5, -3.0), pow(6, -3.0), pow(7, -3.0), pow(8, -3.0)}}; /** * Returns true if the two given literals should be placed in the same chunk as From f28feee57d3e7525450aab042d6c6f12f9bead7d Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Thu, 30 May 2019 11:31:29 -0700 Subject: [PATCH 05/18] unit: avoid UB by making integer literal explicitly unsigned reported by cppcheck as: [unit/internal/uniform_ops.cpp:78]: (error) Shifting signed 32-bit value by 31 bits is undefined behaviour [unit/internal/uniform_ops.cpp:109]: (error) Shifting signed 32-bit value by 31 bits is undefined behaviour [unit/internal/uniform_ops.cpp:127]: (error) Shifting signed 32-bit value by 31 bits is undefined behaviour [unit/internal/uniform_ops.cpp:145]: (error) Shifting signed 32-bit value by 31 bits is undefined behaviour --- unit/internal/uniform_ops.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unit/internal/uniform_ops.cpp b/unit/internal/uniform_ops.cpp index 10defdbd..7d394e02 100644 --- a/unit/internal/uniform_ops.cpp +++ b/unit/internal/uniform_ops.cpp @@ -75,7 +75,7 @@ TEST(Uniform, loadstore_u16) { TEST(Uniform, loadstore_u32) { for (int i = 0; i < 32; i++) { - u32 in = 1 << i; + u32 in = 1U << i; const char *cin = (const char *)(&in); u32 out = load_u32(cin); EXPECT_EQ(in, out); @@ -106,7 +106,7 @@ TEST(Uniform, loadstore_m128) { } in; for (int i = 0; i < 128; i++) { memset(&in, 0, sizeof(in)); - in.words[i/32] = 1 << (i % 32); + in.words[i/32] = 1U << (i % 32); const char *cin = (const char *)(&in); m128 out = load_m128(cin); EXPECT_EQ(0, memcmp(&out, &in, sizeof(out))); @@ -124,7 +124,7 @@ TEST(Uniform, loadstore_m256) { } in; for (int i = 0; i < 256; i++) { memset(&in, 0, sizeof(in)); - in.words[i/32] = 1 << (i % 32); + in.words[i/32] = 1U << (i % 32); const char *cin = (const char *)(&in); m256 out = load_m256(cin); EXPECT_EQ(0, memcmp(&out, &in, sizeof(out))); @@ -142,7 +142,7 @@ TEST(Uniform, loadstore_m512) { } in; for (int i = 0; i < 512; i++) { memset(&in, 0, sizeof(in)); - in.words[i/32] = 1 << (i % 32); + in.words[i/32] = 1U << (i % 32); const char *cin = (const char *)(&in); m512 out = 
load_m512(cin); EXPECT_EQ(0, memcmp(&out, &in, sizeof(out))); From 7ea4e06275f0178461ac8417c5f12233f3064023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Mon, 3 Jun 2019 15:32:38 -0700 Subject: [PATCH 06/18] tools: hscollider FTBS in alpine linux alpine uses musl instead of glibc and therefore doesn't have backtrace() as part of its libc. POSIX mandates that _exit() be defined through unistd.h which used to be included together with execinfo.h when backtrace() was detected and therefore it happened to build fine for linux or freebsd (when using libexecinfo from the system or ports). since there was a macro already defined to test for unistd.h use that instead and decouple this dependency, so that the code could be built even when no backtrace() is provided (as expected also in OpenBSD) --- tools/hscollider/sig.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/hscollider/sig.cpp b/tools/hscollider/sig.cpp index dc815140..7d580e41 100644 --- a/tools/hscollider/sig.cpp +++ b/tools/hscollider/sig.cpp @@ -42,7 +42,10 @@ #ifdef HAVE_BACKTRACE #include -#include +#endif + +#ifdef HAVE_UNISTD_H +#include // for _exit #endif #define BACKTRACE_BUFFER_SIZE 200 From 1f4c10a58d94d73b32bf5b7c9682329c7c1755cf Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Wed, 27 Mar 2019 15:19:14 +0800 Subject: [PATCH 07/18] Logical combination: support EOD match from purely negative case. 
--- src/parser/logical_combination.cpp | 43 +--------- src/report.h | 54 +++++++++++- src/rose/match.c | 17 ++++ src/rose/program_runtime.c | 72 +++++++++++++++- src/rose/rose.h | 5 +- src/rose/rose_build_bytecode.cpp | 13 +++ src/rose/rose_build_dump.cpp | 26 +++++- src/rose/rose_build_instructions.cpp | 3 +- src/rose/rose_build_instructions.h | 10 ++- src/rose/rose_build_program.cpp | 6 +- src/rose/rose_build_program.h | 3 +- src/rose/rose_internal.h | 4 +- src/rose/rose_program.h | 15 +++- src/runtime.c | 37 ++------ tools/hscollider/GraphTruth.cpp | 49 ++++++++++- tools/hscollider/GroundTruth.cpp | 49 ++++++++++- unit/hyperscan/bad_patterns.txt | 11 +-- unit/hyperscan/logical_combination.cpp | 112 ++++++++++++++++++++++++- 18 files changed, 437 insertions(+), 92 deletions(-) diff --git a/src/parser/logical_combination.cpp b/src/parser/logical_combination.cpp index b78390b0..49e060c9 100644 --- a/src/parser/logical_combination.cpp +++ b/src/parser/logical_combination.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Intel Corporation + * Copyright (c) 2018-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -254,44 +254,6 @@ void popOperator(vector &op_stack, vector &subid_stack, op_stack.pop_back(); } -static -char getValue(const vector &lv, u32 ckey) { - if (ckey & LOGICAL_OP_BIT) { - return lv[ckey & ~LOGICAL_OP_BIT]; - } else { - return 0; - } -} - -static -bool hasMatchFromPurelyNegative(const vector &tree, - u32 start, u32 result) { - vector lv(tree.size()); - assert(start <= result); - for (u32 i = start; i <= result; i++) { - assert(i & LOGICAL_OP_BIT); - const LogicalOp &op = tree[i & ~LOGICAL_OP_BIT]; - assert(i == op.id); - switch (op.op) { - case LOGICAL_OP_NOT: - lv[op.id & ~LOGICAL_OP_BIT] = !getValue(lv, op.ro); - break; - case LOGICAL_OP_AND: - lv[op.id & ~LOGICAL_OP_BIT] = getValue(lv, op.lo) & - getValue(lv, op.ro); - break; 
- case LOGICAL_OP_OR: - lv[op.id & ~LOGICAL_OP_BIT] = getValue(lv, op.lo) | - getValue(lv, op.ro); - break; - default: - assert(0); - break; - } - } - return lv[result & ~LOGICAL_OP_BIT]; -} - void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical, u32 ekey, u64a min_offset, u64a max_offset) { @@ -366,9 +328,6 @@ void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical, if (lkey_start == INVALID_LKEY) { throw CompileError("No logical operation."); } - if (hasMatchFromPurelyNegative(logicalTree, lkey_start, lkey_result)) { - throw CompileError("Has match from purely negative sub-expressions."); - } combinationInfoAdd(ckey, id, ekey, lkey_start, lkey_result, min_offset, max_offset); } diff --git a/src/report.h b/src/report.h index a2e2d0f3..b35f4c05 100644 --- a/src/report.h +++ b/src/report.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -222,6 +222,58 @@ char isLogicalCombination(const struct RoseEngine *rose, char *lvec, return getLogicalVal(rose, lvec, result); } +/** \brief Returns 1 if combination matches when no sub-expression matches. 
*/ +static really_inline +char isPurelyNegativeMatch(const struct RoseEngine *rose, char *lvec, + u32 start, u32 result) { + const struct LogicalOp *logicalTree = (const struct LogicalOp *) + ((const char *)rose + rose->logicalTreeOffset); + assert(start >= rose->lkeyCount); + assert(start <= result); + assert(result < rose->lkeyCount + rose->lopCount); + for (u32 i = start; i <= result; i++) { + const struct LogicalOp *op = logicalTree + (i - rose->lkeyCount); + assert(i == op->id); + assert(op->op <= LAST_LOGICAL_OP); + switch ((enum LogicalOpType)op->op) { + case LOGICAL_OP_NOT: + if ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro)) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + !getLogicalVal(rose, lvec, op->ro)); + break; + case LOGICAL_OP_AND: + if (((op->lo < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->lo)) || + ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro))) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) & + getLogicalVal(rose, lvec, op->ro)); // && + break; + case LOGICAL_OP_OR: + if (((op->lo < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->lo)) || + ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro))) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) | + getLogicalVal(rose, lvec, op->ro)); // || + break; + } + } + return getLogicalVal(rose, lvec, result); +} + /** \brief Clear all keys in the logical vector. */ static really_inline void clearLvec(const struct RoseEngine *rose, char *lvec, char *cvec) { diff --git a/src/rose/match.c b/src/rose/match.c index 192b4709..c91b2a50 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -591,6 +591,23 @@ int roseRunFlushCombProgram(const struct RoseEngine *rose, return MO_CONTINUE_MATCHING; } +/** + * \brief Execute last flush combination program. 
+ * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunLastFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end) { + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, + rose->lastFlushCombProgramOffset, + 0, end, 0); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + return MO_CONTINUE_MATCHING; +} + int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; assert(scratch && scratch->magic == SCRATCH_MAGIC); diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 7f5150e0..4c487062 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -1875,6 +1875,49 @@ hwlmcb_rv_t flushActiveCombinations(const struct RoseEngine *t, return HWLM_CONTINUE_MATCHING; } +static rose_inline +hwlmcb_rv_t checkPurelyNegatives(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + for (u32 i = 0; i < t->ckeyCount; i++) { + const struct CombInfo *combInfoMap = (const struct CombInfo *) + ((const char *)t + t->combInfoMapOffset); + const struct CombInfo *ci = combInfoMap + i; + if ((ci->min_offset != 0) && (end < ci->min_offset)) { + DEBUG_PRINTF("halt: before min_offset=%llu\n", ci->min_offset); + continue; + } + if ((ci->max_offset != MAX_OFFSET) && (end > ci->max_offset)) { + DEBUG_PRINTF("halt: after max_offset=%llu\n", ci->max_offset); + continue; + } + + DEBUG_PRINTF("check ekey %u\n", ci->ekey); + if (ci->ekey != INVALID_EKEY) { + assert(ci->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ci->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ci->ekey); + continue; + } + } + + DEBUG_PRINTF("check ckey %u purely negative\n", i); + char *lvec = scratch->core_info.logicalVector; + if (!isPurelyNegativeMatch(t, lvec, ci->start, ci->result)) { + 
DEBUG_PRINTF("Logical Combination from purely negative Failed!\n"); + continue; + } + + DEBUG_PRINTF("Logical Combination from purely negative Passed!\n"); + if (roseReport(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + #if !defined(_WIN32) #define PROGRAM_CASE(name) \ case ROSE_INSTR_##name: { \ @@ -2004,7 +2047,8 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, &&LABEL_ROSE_INSTR_SET_LOGICAL, &&LABEL_ROSE_INSTR_SET_COMBINATION, &&LABEL_ROSE_INSTR_FLUSH_COMBINATION, - &&LABEL_ROSE_INSTR_SET_EXHAUST + &&LABEL_ROSE_INSTR_SET_EXHAUST, + &&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION }; #endif @@ -2772,6 +2816,19 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(LAST_FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + if (checkPurelyNegatives(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + default: { assert(0); // unreachable scratch->core_info.status |= STATUS_ERROR; @@ -3082,6 +3139,19 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(LAST_FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + if (checkPurelyNegatives(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + default: { assert(0); // unreachable scratch->core_info.status |= STATUS_ERROR; diff --git a/src/rose/rose.h b/src/rose/rose.h index c2b682f6..409b7002 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel 
Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,4 +56,7 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, int roseRunFlushCombProgram(const struct RoseEngine *rose, struct hs_scratch *scratch, u64a end); +int roseRunLastFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end); + #endif // ROSE_H diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 0ef20f21..908d13c1 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -3370,6 +3370,15 @@ RoseProgram makeFlushCombProgram(const RoseEngine &t) { return program; } +static +RoseProgram makeLastFlushCombProgram(const RoseEngine &t) { + RoseProgram program; + if (t.ckeyCount) { + addLastFlushCombinationProgram(program); + } + return program; +} + static u32 history_required(const rose_literal_id &key) { if (key.msk.size() < key.s.length()) { @@ -3740,6 +3749,10 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { auto flushComb_prog = makeFlushCombProgram(proto); proto.flushCombProgramOffset = writeProgram(bc, move(flushComb_prog)); + auto lastFlushComb_prog = makeLastFlushCombProgram(proto); + proto.lastFlushCombProgramOffset = + writeProgram(bc, move(lastFlushComb_prog)); + // Build anchored matcher. 
auto atable = buildAnchoredMatcher(*this, fragments, anchored_dfas); if (atable) { diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 2eb7bb51..8999daef 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1486,6 +1486,9 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(LAST_FLUSH_COMBINATION) {} + PROGRAM_NEXT_INSTRUCTION + default: os << " UNKNOWN (code " << int{code} << ")" << endl; os << " " << endl; @@ -1557,6 +1560,25 @@ void dumpRoseFlushCombPrograms(const RoseEngine *t, const string &filename) { os.close(); } +static +void dumpRoseLastFlushCombPrograms(const RoseEngine *t, + const string &filename) { + ofstream os(filename); + const char *base = (const char *)t; + + if (t->lastFlushCombProgramOffset) { + os << "Last Flush Combination Program @ " + << t->lastFlushCombProgramOffset + << ":" << endl; + dumpProgram(os, t, base + t->lastFlushCombProgramOffset); + os << endl; + } else { + os << "" << endl; + } + + os.close(); +} + static void dumpRoseReportPrograms(const RoseEngine *t, const string &filename) { ofstream os(filename); @@ -2249,6 +2271,8 @@ void roseDumpPrograms(const vector &fragments, const RoseEngine *t, dumpRoseLitPrograms(fragments, t, base + "/rose_lit_programs.txt"); dumpRoseEodPrograms(t, base + "/rose_eod_programs.txt"); dumpRoseFlushCombPrograms(t, base + "/rose_flush_comb_programs.txt"); + dumpRoseLastFlushCombPrograms(t, + base + "/rose_last_flush_comb_programs.txt"); dumpRoseReportPrograms(t, base + "/rose_report_programs.txt"); dumpRoseAnchoredPrograms(t, base + "/rose_anchored_programs.txt"); dumpRoseDelayPrograms(t, base + "/rose_delay_programs.txt"); diff 
--git a/src/rose/rose_build_instructions.cpp b/src/rose/rose_build_instructions.cpp index 2fe53455..c503f731 100644 --- a/src/rose/rose_build_instructions.cpp +++ b/src/rose/rose_build_instructions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, Intel Corporation + * Copyright (c) 2017-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,6 +48,7 @@ RoseInstrMatcherEod::~RoseInstrMatcherEod() = default; RoseInstrEnd::~RoseInstrEnd() = default; RoseInstrClearWorkDone::~RoseInstrClearWorkDone() = default; RoseInstrFlushCombination::~RoseInstrFlushCombination() = default; +RoseInstrLastFlushCombination::~RoseInstrLastFlushCombination() = default; using OffsetMap = RoseInstruction::OffsetMap; diff --git a/src/rose/rose_build_instructions.h b/src/rose/rose_build_instructions.h index 61e6d7a6..306a4166 100644 --- a/src/rose/rose_build_instructions.h +++ b/src/rose/rose_build_instructions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, Intel Corporation + * Copyright (c) 2017-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -2206,6 +2206,14 @@ public: ~RoseInstrFlushCombination() override; }; +class RoseInstrLastFlushCombination + : public RoseInstrBaseTrivial { +public: + ~RoseInstrLastFlushCombination() override; +}; + class RoseInstrSetExhaust : public RoseInstrBaseNoTargets()); } +void addLastFlushCombinationProgram(RoseProgram &program) { + program.add_before_end(make_unique()); +} + static void makeRoleCheckLeftfix(const RoseBuildImpl &build, const map &leftfix_info, diff --git a/src/rose/rose_build_program.h b/src/rose/rose_build_program.h index 8c8c37ed..7d781f31 100644 --- a/src/rose/rose_build_program.h +++ b/src/rose/rose_build_program.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * 
Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -188,6 +188,7 @@ void addEnginesEodProgram(u32 eodNfaIterOffset, RoseProgram &program); void addSuffixesEodProgram(RoseProgram &program); void addMatcherEodProgram(RoseProgram &program); void addFlushCombinationProgram(RoseProgram &program); +void addLastFlushCombinationProgram(RoseProgram &program); static constexpr u32 INVALID_QUEUE = ~0U; diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 386b035c..ff24a9cc 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -426,6 +426,8 @@ struct RoseEngine { u32 eodProgramOffset; //!< EOD program, otherwise 0. u32 flushCombProgramOffset; /**< FlushCombination program, otherwise 0 */ + u32 lastFlushCombProgramOffset; /**< LastFlushCombination program, + * otherwise 0 */ u32 lastByteHistoryIterOffset; // if non-zero diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 7feee04f..e5485476 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -201,7 +201,14 @@ enum RoseInstructionCode { /** \brief Mark as exhausted instead of report while quiet. */ ROSE_INSTR_SET_EXHAUST, - LAST_ROSE_INSTRUCTION = ROSE_INSTR_SET_EXHAUST //!< Sentinel. 
+ /** + * \brief Calculate any combination's logical value if none of its + * sub-expression matches until EOD, then check if compliant with any + * logical constraints. + */ + ROSE_INSTR_LAST_FLUSH_COMBINATION, + + LAST_ROSE_INSTRUCTION = ROSE_INSTR_LAST_FLUSH_COMBINATION //!< Sentinel. }; struct ROSE_STRUCT_END { @@ -674,4 +681,8 @@ struct ROSE_STRUCT_SET_EXHAUST { u8 code; //!< From enum RoseInstructionCode. u32 ekey; //!< Exhaustion key. }; + +struct ROSE_STRUCT_LAST_FLUSH_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. +}; #endif // ROSE_ROSE_PROGRAM_H diff --git a/src/runtime.c b/src/runtime.c index cfcd0f7c..43cdab09 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -455,8 +455,9 @@ set_retval: return HS_UNKNOWN_ERROR; } - if (rose->flushCombProgramOffset) { - if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) { + if (rose->lastFlushCombProgramOffset) { + if (roseRunLastFlushCombProgram(rose, scratch, length) + == MO_HALT_MATCHING) { if (unlikely(internal_matching_error(scratch))) { unmarkScratchInUse(scratch); return HS_UNKNOWN_ERROR; @@ -698,8 +699,9 @@ void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch, } } - if (rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) { - if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) { + if (rose->lastFlushCombProgramOffset && !told_to_stop_matching(scratch)) { + if (roseRunLastFlushCombProgram(rose, scratch, id->offset) + == MO_HALT_MATCHING) { DEBUG_PRINTF("told to stop matching\n"); scratch->core_info.status |= STATUS_TERMINATED; } @@ -1000,31 +1002,22 @@ hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, if (onEvent) { if (!scratch || !validScratch(id->rose, scratch)) { + hs_stream_free(id); return HS_INVALID; } if (unlikely(markScratchInUse(scratch))) { + hs_stream_free(id); return HS_SCRATCH_IN_USE; } report_eod_matches(id, scratch, onEvent, context); if (unlikely(internal_matching_error(scratch))) { 
unmarkScratchInUse(scratch); + hs_stream_free(id); return HS_UNKNOWN_ERROR; } unmarkScratchInUse(scratch); } - if (id->rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) { - if (roseRunFlushCombProgram(id->rose, scratch, ~0ULL) - == MO_HALT_MATCHING) { - scratch->core_info.status |= STATUS_TERMINATED; - if (unlikely(internal_matching_error(scratch))) { - unmarkScratchInUse(scratch); - return HS_UNKNOWN_ERROR; - } - unmarkScratchInUse(scratch); - } - } - hs_stream_free(id); return HS_SUCCESS; @@ -1054,18 +1047,6 @@ hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags, unmarkScratchInUse(scratch); } - if (id->rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) { - if (roseRunFlushCombProgram(id->rose, scratch, ~0ULL) - == MO_HALT_MATCHING) { - scratch->core_info.status |= STATUS_TERMINATED; - if (unlikely(internal_matching_error(scratch))) { - unmarkScratchInUse(scratch); - return HS_UNKNOWN_ERROR; - } - unmarkScratchInUse(scratch); - } - } - // history already initialised init_stream(id, id->rose, 0); diff --git a/tools/hscollider/GraphTruth.cpp b/tools/hscollider/GraphTruth.cpp index b4b3f809..0b67b11c 100644 --- a/tools/hscollider/GraphTruth.cpp +++ b/tools/hscollider/GraphTruth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -299,6 +299,46 @@ char isLogicalCombination(vector &lv, const vector &comb, return lv[result]; } +/** \brief Returns 1 if combination matches when no sub-expression matches. 
*/ +static +char isPurelyNegativeMatch(vector &lv, const vector &comb, + size_t lkeyCount, unsigned start, unsigned result) { + assert(start <= result); + for (unsigned i = start; i <= result; i++) { + const LogicalOp &op = comb[i - lkeyCount]; + assert(i == op.id); + switch (op.op) { + case LOGICAL_OP_NOT: + if ((op.ro < lkeyCount) && lv[op.ro]) { + // sub-expression not negative + return 0; + } + lv[op.id] = !lv[op.ro]; + break; + case LOGICAL_OP_AND: + if (((op.lo < lkeyCount) && lv[op.lo]) || + ((op.ro < lkeyCount) && lv[op.ro])) { + // sub-expression not negative + return 0; + } + lv[op.id] = lv[op.lo] & lv[op.ro]; // && + break; + case LOGICAL_OP_OR: + if (((op.lo < lkeyCount) && lv[op.lo]) || + ((op.ro < lkeyCount) && lv[op.ro])) { + // sub-expression not negative + return 0; + } + lv[op.id] = lv[op.lo] | lv[op.ro]; // || + break; + default: + assert(0); + break; + } + } + return lv[result]; +} + bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi, const string &buffer, ResultSet &rs, string &error) { if (cngi.quiet) { @@ -359,6 +399,13 @@ bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi, } } } + if (isPurelyNegativeMatch(lv, comb, m_lkey.size(), + li.start, li.result)) { + u64a to = buffer.length(); + if ((to >= cngi.min_offset) && (to <= cngi.max_offset)) { + rs.addMatch(0, to); + } + } return true; } diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index fe038c81..5a4bdc00 100644 --- a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -557,6 +557,46 @@ char isLogicalCombination(vector &lv, const vector &comb, return lv[result]; } +/** \brief Returns 1 if combination matches when no sub-expression 
matches. */ +static +char isPurelyNegativeMatch(vector &lv, const vector &comb, + size_t lkeyCount, unsigned start, unsigned result) { + assert(start <= result); + for (unsigned i = start; i <= result; i++) { + const LogicalOp &op = comb[i - lkeyCount]; + assert(i == op.id); + switch (op.op) { + case LOGICAL_OP_NOT: + if ((op.ro < lkeyCount) && lv[op.ro]) { + // sub-expression not negative + return 0; + } + lv[op.id] = !lv[op.ro]; + break; + case LOGICAL_OP_AND: + if (((op.lo < lkeyCount) && lv[op.lo]) || + ((op.ro < lkeyCount) && lv[op.ro])) { + // sub-expression not negative + return 0; + } + lv[op.id] = lv[op.lo] & lv[op.ro]; // && + break; + case LOGICAL_OP_OR: + if (((op.lo < lkeyCount) && lv[op.lo]) || + ((op.ro < lkeyCount) && lv[op.ro])) { + // sub-expression not negative + return 0; + } + lv[op.id] = lv[op.lo] | lv[op.ro]; // || + break; + default: + assert(0); + break; + } + } + return lv[result]; +} + bool GroundTruth::run(unsigned, const CompiledPcre &compiled, const string &buffer, ResultSet &rs, string &error) { if (compiled.quiet) { @@ -616,6 +656,13 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, } } } + if (isPurelyNegativeMatch(lv, comb, m_lkey.size(), + li.start, li.result)) { + u64a to = buffer.length(); + if ((to >= compiled.min_offset) && (to <= compiled.max_offset)) { + rs.addMatch(0, to); + } + } return true; } diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 6d4283da..c4a9f13c 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -155,11 +155,6 @@ 158:/141 & (142|!143) )| 144/C #Not enough left parentheses at index 17. 159:/1234567890 & (142|!143 )/C #Expression id too large at index 10. 160:/141 & (142|!143 )|/C #Not enough operand at index 18. -161:/!141/C #Has match from purely negative sub-expressions. -162:/!141 | 142 | 143/C #Has match from purely negative sub-expressions. -163:/!141 & !142 & !143/C #Has match from purely negative sub-expressions. 
-164:/(141 | !142 & !143)/C #Has match from purely negative sub-expressions. -165:/!(141 | 142 | 143)/C #Has match from purely negative sub-expressions. -166:/141/C #No logical operation. -167:/119 & 121/C #Unknown sub-expression id. -168:/166 & 167/C #Unknown sub-expression id. +161:/141/C #No logical operation. +162:/119 & 121/C #Unknown sub-expression id. +163:/166 & 167/C #Unknown sub-expression id. diff --git a/unit/hyperscan/logical_combination.cpp b/unit/hyperscan/logical_combination.cpp index 169de333..5b1c1ec2 100644 --- a/unit/hyperscan/logical_combination.cpp +++ b/unit/hyperscan/logical_combination.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Intel Corporation + * Copyright (c) 2018-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -694,3 +694,113 @@ TEST(LogicalCombination, MultiCombQuietUniSub5) { err = hs_free_scratch(scratch); ASSERT_EQ(HS_SUCCESS, err); } + +TEST(LogicalCombination, SingleCombPurelyNegative6) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(!201 | 202 & 203) & (!204 | 205)"}; + unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION}; + unsigned ids[] = {201, 202, 203, 204, 205, 1002}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(1U, c.matches.size()); + ASSERT_EQ(MatchRecord(53, 1002), 
c.matches[0]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, SingleCombQuietPurelyNegative6) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "(!201 | 202 & 203) & (!204 | 205)"}; + unsigned flags[] = {0, 0, 0, 0, 0, HS_FLAG_COMBINATION | HS_FLAG_QUIET}; + unsigned ids[] = {201, 202, 203, 204, 205, 1002}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(0U, c.matches.size()); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + +TEST(LogicalCombination, MultiCombPurelyNegativeUniSub6) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "-----------------------------------------------" + "xxxfedxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "-----------------------------------------------" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "------------------------------------------"; + const char *expr[] = {"abc", "def", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "cba", "fed", "google.*cn", + "haystacks{4,8}", "ijkl[oOp]", "cab", "fee", + "goobar.*jp", "shockwave{4,6}", "ijkl[rRs]", + "(101 & 102 & 103) | (!104 & !105)", + "(!201 | 202 & 203) & (!204 | 205)", + "((301 | 302) & 303) & (304 | 305)"}; + 
unsigned flags[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + HS_FLAG_COMBINATION, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301, + 302, 303, 304, 305, 1001, 1002, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(3U, c.matches.size()); + ASSERT_EQ(MatchRecord(106, 202), c.matches[0]); + ASSERT_EQ(MatchRecord(106, 1002), c.matches[1]); + ASSERT_EQ(MatchRecord(300, 1001), c.matches[2]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} From 64ea43ea39d4b0396942e06702377a0cb079350a Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Thu, 6 Jun 2019 15:12:24 +0800 Subject: [PATCH 08/18] Logical Combination: avoid corruption of pending combination report in streaming mode. 
Fixes github issue #165 --- src/rose/program_runtime.c | 27 +++++++++++++++++++++++---- src/runtime.c | 8 ++++++-- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 4c487062..4238f2e4 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -480,6 +480,25 @@ hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, return roseHaltIfExhausted(t, scratch); } +static rose_inline +hwlmcb_rv_t roseReportComb(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end, + ReportID onmatch, s32 offset_adjust, u32 ekey) { + DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); + + int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + /* catches up engines enough to ensure any earlier mpv triggers are enqueued * and then adds the trigger to the mpv queue. 
*/ static rose_inline @@ -1866,8 +1885,8 @@ hwlmcb_rv_t flushActiveCombinations(const struct RoseEngine *t, } DEBUG_PRINTF("Logical Combination Passed!\n"); - if (roseReport(t, scratch, end, ci->id, 0, - ci->ekey) == HWLM_TERMINATE_MATCHING) { + if (roseReportComb(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { return HWLM_TERMINATE_MATCHING; } } @@ -1910,8 +1929,8 @@ hwlmcb_rv_t checkPurelyNegatives(const struct RoseEngine *t, } DEBUG_PRINTF("Logical Combination from purely negative Passed!\n"); - if (roseReport(t, scratch, end, ci->id, 0, - ci->ekey) == HWLM_TERMINATE_MATCHING) { + if (roseReportComb(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { return HWLM_TERMINATE_MATCHING; } } diff --git a/src/runtime.c b/src/runtime.c index 43cdab09..078c8821 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -652,7 +652,9 @@ void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch, scratch->core_info.logicalVector = state + rose->stateOffsets.logicalVec; scratch->core_info.combVector = state + rose->stateOffsets.combVec; - scratch->tctxt.lastCombMatchOffset = id->offset; + if (!id->offset) { + scratch->tctxt.lastCombMatchOffset = id->offset; + } } if (rose->somLocationCount) { @@ -908,7 +910,9 @@ hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data, scratch->core_info.logicalVector = state + rose->stateOffsets.logicalVec; scratch->core_info.combVector = state + rose->stateOffsets.combVec; - scratch->tctxt.lastCombMatchOffset = id->offset; + if (!id->offset) { + scratch->tctxt.lastCombMatchOffset = id->offset; + } } assert(scratch->core_info.hlen <= id->offset && scratch->core_info.hlen <= rose->historyRequired); From fdc3c290b668ce7a639515c4a844c1244d602570 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Fri, 7 Jun 2019 11:38:37 +0800 Subject: [PATCH 09/18] Logical combination: add streaming mode unit test MultiCombStream1. 
--- unit/hyperscan/logical_combination.cpp | 63 ++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/unit/hyperscan/logical_combination.cpp b/unit/hyperscan/logical_combination.cpp index 5b1c1ec2..49854be1 100644 --- a/unit/hyperscan/logical_combination.cpp +++ b/unit/hyperscan/logical_combination.cpp @@ -804,3 +804,66 @@ TEST(LogicalCombination, MultiCombPurelyNegativeUniSub6) { err = hs_free_scratch(scratch); ASSERT_EQ(HS_SUCCESS, err); } + +TEST(LogicalCombination, MultiCombStream1) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data[] = {"xxxxxxxabcxxxxxxxdefxxxghixxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxghixxxxxxxxxxxabcxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxdefxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxxyzxxxxxxxxxxxxxxxxxxxxxghixxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxghixxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxzxy", + "z"}; + const char *expr[] = {"abc", "def", "xyz", "zxyz", + "101 & 102", "201 & !202"}; + unsigned flags[] = {0, 0, 0, 0, HS_FLAG_COMBINATION, HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 201, 202, 1001, 1002}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 6, HS_MODE_STREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + hs_stream_t *stream 
= nullptr; + err = hs_open_stream(db, 0, &stream); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(stream != nullptr); + + c.halt = 0; + int i; + for (i = 0; i < 11; i++) { + err = hs_scan_stream(stream, data[i].c_str(), data[i].size(), 0, + scratch, record_cb, (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + } + err = hs_close_stream(stream, scratch, dummy_cb, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + ASSERT_EQ(11U, c.matches.size()); + ASSERT_EQ(MatchRecord(10, 101), c.matches[0]); + ASSERT_EQ(MatchRecord(20, 102), c.matches[1]); + ASSERT_EQ(MatchRecord(20, 1001), c.matches[2]); + ASSERT_EQ(MatchRecord(109, 101), c.matches[3]); + ASSERT_EQ(MatchRecord(109, 1001), c.matches[4]); + ASSERT_EQ(MatchRecord(171, 102), c.matches[5]); + ASSERT_EQ(MatchRecord(171, 1001), c.matches[6]); + ASSERT_EQ(MatchRecord(247, 201), c.matches[7]); + ASSERT_EQ(MatchRecord(247, 1002), c.matches[8]); + ASSERT_EQ(MatchRecord(761, 201), c.matches[9]); + ASSERT_EQ(MatchRecord(761, 202), c.matches[10]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} From 4b1927c03847a8efafa25bc49adf1fe9fc78c401 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Sun, 31 Mar 2019 11:16:50 +0800 Subject: [PATCH 10/18] Logical combination: add purely negative match at EOD unit test MultiCombPurelyNegativeUniSubEOD6. 
--- unit/hyperscan/logical_combination.cpp | 53 ++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/unit/hyperscan/logical_combination.cpp b/unit/hyperscan/logical_combination.cpp index 49854be1..9558948f 100644 --- a/unit/hyperscan/logical_combination.cpp +++ b/unit/hyperscan/logical_combination.cpp @@ -805,6 +805,59 @@ TEST(LogicalCombination, MultiCombPurelyNegativeUniSub6) { ASSERT_EQ(HS_SUCCESS, err); } +TEST(LogicalCombination, MultiCombPurelyNegativeUniSubEOD6) { + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + CallBackContext c; + string data = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "-----------------------------------------------" + "xdefedxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "-----------------------------------------------" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "-------------------------------------defed"; + const char *expr[] = {"abc", "defed", "foobar.*gh", "teakettle{4,10}", + "ijkl[mMn]", "cba", "fed", "google.*cn", + "haystacks{4,8}", "ijkl[oOp]", "cab", "fee", + "goobar.*jp", "shockwave{4,6}", "ijkl[rRs]", + "(101 & 102 & 103) | (!104 & !105)", + "(!201 | 202 & 203) & (!204 | 205)", + "((301 | 302) & 303) & (304 | 305)"}; + unsigned flags[] = {0, 0, 0, 0, 0, 0, HS_FLAG_MULTILINE, + 0, 0, 0, 0, 0, 0, 0, 0, + HS_FLAG_COMBINATION, HS_FLAG_COMBINATION, + HS_FLAG_COMBINATION}; + unsigned ids[] = {101, 102, 103, 104, 105, 201, 202, 203, 204, 205, 301, + 302, 303, 304, 305, 1001, 1002, 1003}; + hs_error_t err = hs_compile_multi(expr, flags, ids, 18, HS_MODE_NOSTREAM, + nullptr, &db, &compile_err); + + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + c.halt = 0; + err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_EQ(8U, 
c.matches.size()); + ASSERT_EQ(MatchRecord(106, 102), c.matches[0]); + ASSERT_EQ(MatchRecord(106, 202), c.matches[1]); + ASSERT_EQ(MatchRecord(106, 1001), c.matches[2]); + ASSERT_EQ(MatchRecord(106, 1002), c.matches[3]); + ASSERT_EQ(MatchRecord(300, 102), c.matches[4]); + ASSERT_EQ(MatchRecord(300, 202), c.matches[5]); + ASSERT_EQ(MatchRecord(300, 1001), c.matches[6]); + ASSERT_EQ(MatchRecord(300, 1002), c.matches[7]); + + hs_free_database(db); + err = hs_free_scratch(scratch); + ASSERT_EQ(HS_SUCCESS, err); +} + TEST(LogicalCombination, MultiCombStream1) { hs_database_t *db = nullptr; hs_compile_error_t *compile_err = nullptr; From 8bfbf07f75411924626910970de89405ce2a9605 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Thu, 20 Jun 2019 08:59:18 +0800 Subject: [PATCH 11/18] Do not free stream unless hs_close_stream returns success. (by unit-hyperscan HyperscanArgChecks.CloseStreamNoScratch) --- src/runtime.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/runtime.c b/src/runtime.c index 078c8821..ed1eaf53 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -1006,17 +1006,14 @@ hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, if (onEvent) { if (!scratch || !validScratch(id->rose, scratch)) { - hs_stream_free(id); return HS_INVALID; } if (unlikely(markScratchInUse(scratch))) { - hs_stream_free(id); return HS_SCRATCH_IN_USE; } report_eod_matches(id, scratch, onEvent, context); if (unlikely(internal_matching_error(scratch))) { unmarkScratchInUse(scratch); - hs_stream_free(id); return HS_UNKNOWN_ERROR; } unmarkScratchInUse(scratch); From 23e5f06594c8ecb6951e1de4b531af8a1c29a3cd Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 18 Jul 2019 00:29:27 +0800 Subject: [PATCH 12/18] add new Literal API for pure literal expressions: Design compile time api hs_compile_lit() and hs_compile_lit_multi() to handle pure literal pattern sets. Corresponding option --literal-on is added for hyperscan testing suites. 
Extended parameters and part of flags are not supported for this api. --- src/compiler/compiler.cpp | 92 +++++++++++- src/compiler/compiler.h | 27 +++- src/fdr/fdr_compile.cpp | 3 - src/fdr/fdr_confirm.h | 1 - src/fdr/fdr_confirm_compile.cpp | 1 - src/fdr/fdr_confirm_runtime.h | 2 - src/hs.cpp | 163 +++++++++++++++++++- src/hs_compile.h | 181 ++++++++++++++++++++++- src/hs_internal.h | 13 +- src/hwlm/hwlm_literal.cpp | 5 +- src/hwlm/hwlm_literal.h | 5 +- src/parser/shortcut_literal.cpp | 1 - src/rose/block.c | 2 +- src/rose/match.c | 14 +- src/rose/program_runtime.c | 80 ++++++++++ src/rose/rose_build_bytecode.cpp | 31 +--- src/rose/rose_build_impl.h | 9 +- src/rose/rose_build_matchers.cpp | 3 +- src/rose/rose_internal.h | 1 + src/runtime.c | 1 - src/scratch.c | 1 - src/scratch.h | 1 - src/util/ue2string.cpp | 3 +- src/util/ue2string.h | 7 - tools/hsbench/common.h | 3 +- tools/hsbench/engine_hyperscan.cpp | 36 +++-- tools/hsbench/main.cpp | 8 +- tools/hscheck/main.cpp | 45 +++++- tools/hscollider/GroundTruth.cpp | 31 +++- tools/hscollider/NfaGeneratedCorpora.cpp | 16 +- tools/hscollider/UltimateTruth.cpp | 21 ++- tools/hscollider/args.cpp | 6 +- tools/hscollider/common.h | 3 +- tools/hscollider/main.cpp | 3 +- tools/hsdump/main.cpp | 26 +++- util/string_util.h | 16 +- 36 files changed, 745 insertions(+), 116 deletions(-) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index a34eadd0..3382ff42 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,11 +56,13 @@ #include "parser/unsupported.h" #include "parser/utf8_validate.h" #include "rose/rose_build.h" +#include "rose/rose_internal.h" #include "som/slot_manager_dump.h" #include "util/bytecode_ptr.h" #include 
"util/compile_error.h" #include "util/target_info.h" #include "util/verify_types.h" +#include "util/ue2string.h" #include #include @@ -107,6 +109,46 @@ void validateExt(const hs_expr_ext &ext) { } +void ParsedLitExpression::parseLiteral(const char *expression, size_t len, + bool nocase) { + const char *c = expression; + for (size_t i = 0; i < len; i++) { + lit.push_back(*c, nocase); + c++; + } +} + +ParsedLitExpression::ParsedLitExpression(unsigned index_in, + const char *expression, + size_t expLength, unsigned flags, + ReportID report) + : expr(index_in, false, flags & HS_FLAG_SINGLEMATCH, false, false, + SOM_NONE, report, 0, MAX_OFFSET, 0, 0, 0, false) { + // For a pure literal expression, these HS_FLAG_* values are not applicable: + // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET + + if (flags & ~HS_FLAG_ALL) { + DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags); + throw CompileError("Unrecognised flag."); + } + + // FIXME: we disallow highlander + SOM, see UE-1850. + if ((flags & HS_FLAG_SINGLEMATCH) && (flags & HS_FLAG_SOM_LEFTMOST)) { + throw CompileError("HS_FLAG_SINGLEMATCH is not supported in " + "combination with HS_FLAG_SOM_LEFTMOST."); + } + + // Set SOM type. + if (flags & HS_FLAG_SOM_LEFTMOST) { + expr.som = SOM_LEFT; + } + + // Transfer expression text into ue2_literal. + bool nocase = flags & HS_FLAG_CASELESS ?
true : false; + parseLiteral(expression, expLength, nocase); + +} + ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, unsigned flags, ReportID report, const hs_expr_ext *ext) @@ -345,6 +387,49 @@ void addExpression(NG &ng, unsigned index, const char *expression, } } +void addLitExpression(NG &ng, unsigned index, const char *expression, + unsigned flags, const hs_expr_ext *ext, ReportID id, + size_t expLength) { + assert(expression); + const CompileContext &cc = ng.cc; + DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%s', len='%zu'\n", index, + id, flags, expression, expLength); + + // Extended parameters are not supported for pure literal patterns. + if (ext && ext->flags != 0LLU) { + throw CompileError("Extended parameters are not supported for pure " + "literal matching API."); + } + + // Ensure that our pattern isn't too long (in characters). + if (strlen(expression) > cc.grey.limitPatternLength) { + throw CompileError("Pattern length exceeds limit."); + } + + // filter out flags not supported by pure literal API. + u64a not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 | + HS_FLAG_UCP | HS_FLAG_PREFILTER | HS_FLAG_COMBINATION | + HS_FLAG_QUIET; + + if (flags & not_supported) { + throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_MULTILINE, " + "HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are " + "supported in literal API."); + } + + // This expression must be a pure literal, we can build ue2_literal + // directly based on expression text. + ParsedLitExpression ple(index, expression, expLength, flags, id); + + // Feed the ue2_literal into Rose. 
+ const auto &expr = ple.expr; + if (ng.addLiteral(ple.lit, expr.index, expr.report, expr.highlander, + expr.som, expr.quiet)) { + DEBUG_PRINTF("took pure literal\n"); + return; + } +} + static bytecode_ptr generateRoseEngine(NG &ng) { const u32 minWidth = @@ -416,10 +501,13 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) { } -struct hs_database *build(NG &ng, unsigned int *length) { +struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) { assert(length); auto rose = generateRoseEngine(ng); + struct RoseEngine *roseHead = rose.get(); + roseHead->pureLiteral = pureFlag; + if (!rose) { throw CompileError("Unable to generate bytecode."); } diff --git a/src/compiler/compiler.h b/src/compiler/compiler.h index 60d7ca33..b42cb142 100644 --- a/src/compiler/compiler.h +++ b/src/compiler/compiler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #include "compiler/expression_info.h" #include "parser/Component.h" #include "util/noncopyable.h" +#include "util/ue2string.h" #include @@ -66,6 +67,22 @@ public: std::unique_ptr component; }; + +/** \brief Class gathering together the pieces of a parsed lit-expression. */ +class ParsedLitExpression : noncopyable { +public: + ParsedLitExpression(unsigned index, const char *expression, + size_t expLength, unsigned flags, ReportID report); + + void parseLiteral(const char *expression, size_t len, bool nocase); + + /** \brief Expression information (from flags, extparam etc) */ + ExpressionInfo expr; + + /** \brief Format the lit-expression text into Hyperscan literal type. */ + ue2_literal lit; +}; + /** * \brief Class gathering together the pieces of an expression that has been * built into an NFA graph. 
@@ -99,6 +116,10 @@ struct BuiltExpression { void addExpression(NG &ng, unsigned index, const char *expression, unsigned flags, const hs_expr_ext *ext, ReportID report); +void addLitExpression(NG &ng, unsigned index, const char *expression, + unsigned flags, const hs_expr_ext *ext, ReportID id, + size_t expLength); + /** * Build a Hyperscan database out of the expressions we've been given. A * fatal error will result in an exception being thrown. @@ -107,11 +128,13 @@ void addExpression(NG &ng, unsigned index, const char *expression, * The global NG object. * @param[out] length * The number of bytes occupied by the compiled structure. + * @param pureFlag + * The flag indicating invocation from literal API or not. * @return * The compiled structure. Should be deallocated with the * hs_database_free() function. */ -struct hs_database *build(NG &ng, unsigned int *length); +struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag); /** * Constructs an NFA graph from the given expression tree. diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 39cbc335..fcfc0863 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -807,9 +807,6 @@ void findIncludedLits(vector &lits, for (size_t i = 0; i < cnt; i++) { u32 bucket1 = group[i].first; u32 id1 = group[i].second; - if (lits[id1].pure) { - continue; - } buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map, exception_map); } diff --git a/src/fdr/fdr_confirm.h b/src/fdr/fdr_confirm.h index 9490df43..a23082cc 100644 --- a/src/fdr/fdr_confirm.h +++ b/src/fdr/fdr_confirm.h @@ -62,7 +62,6 @@ struct LitInfo { u8 size; u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above. u8 next; - u8 pure; //!< The pass-on of pure flag from hwlmLiteral. 
}; #define FDRC_FLAG_NO_CONFIRM 1 diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 3eab21b2..8e369089 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -87,7 +87,6 @@ void fillLitInfo(const vector &lits, vector &tmpLitInfo, info.flags = flags; info.size = verify_u8(max(lit.msk.size(), lit.s.size())); info.groups = lit.groups; - info.pure = lit.pure; // these are built up assuming a LE machine CONF_TYPE msk = all_ones; diff --git a/src/fdr/fdr_confirm_runtime.h b/src/fdr/fdr_confirm_runtime.h index 67e0d692..5a216495 100644 --- a/src/fdr/fdr_confirm_runtime.h +++ b/src/fdr/fdr_confirm_runtime.h @@ -65,7 +65,6 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a u8 oldNext; // initialized in loop do { assert(ISALIGNED(li)); - scratch->pure = li->pure; if (unlikely((conf_key & li->msk) != li->v)) { goto out; @@ -100,7 +99,6 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a li++; } while (oldNext); scratch->fdr_conf = NULL; - scratch->pure = 0; } #endif diff --git a/src/hs.cpp b/src/hs.cpp index 329702d4..ab54105c 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -251,7 +251,7 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, ng.rm.logicalKeyRenumber(); unsigned length = 0; - struct hs_database *out = build(ng, &length); + struct hs_database *out = build(ng, &length, 0); assert(out); // should have thrown exception on error assert(length); @@ -281,6 +281,130 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, } } +hs_error_t +hs_compile_lit_multi_int(const char *const *expressions, const unsigned *flags, + const unsigned *ids, 
const hs_expr_ext *const *ext, + const size_t *lens, unsigned elements, unsigned mode, + const hs_platform_info_t *platform, hs_database_t **db, + hs_compile_error_t **comp_error, const Grey &g) { + // Check the args: note that it's OK for flags, ids or ext to be null. + if (!comp_error) { + if (db) { + *db = nullptr; + } + // nowhere to write the string, but we can still report an error code + return HS_COMPILER_ERROR; + } + if (!db) { + *comp_error = generateCompileError("Invalid parameter: db is NULL", -1); + return HS_COMPILER_ERROR; + } + if (!expressions) { + *db = nullptr; + *comp_error + = generateCompileError("Invalid parameter: expressions is NULL", + -1); + return HS_COMPILER_ERROR; + } + if (!lens) { + *db = nullptr; + *comp_error = generateCompileError("Invalid parameter: len is NULL", -1); + return HS_COMPILER_ERROR; + } + if (elements == 0) { + *db = nullptr; + *comp_error = generateCompileError("Invalid parameter: elements is zero", -1); + return HS_COMPILER_ERROR; + } + +#if defined(FAT_RUNTIME) + if (!check_ssse3()) { + *db = nullptr; + *comp_error = generateCompileError("Unsupported architecture", -1); + return HS_ARCH_ERROR; + } +#endif + + if (!checkMode(mode, comp_error)) { + *db = nullptr; + assert(*comp_error); // set by checkMode. + return HS_COMPILER_ERROR; + } + + if (!checkPlatform(platform, comp_error)) { + *db = nullptr; + assert(*comp_error); // set by checkPlatform. + return HS_COMPILER_ERROR; + } + + if (elements > g.limitPatternCount) { + *db = nullptr; + *comp_error = generateCompileError("Number of patterns too large", -1); + return HS_COMPILER_ERROR; + } + + // This function is simply a wrapper around both the parser and compiler + bool isStreaming = mode & (HS_MODE_STREAM | HS_MODE_VECTORED); + bool isVectored = mode & HS_MODE_VECTORED; + unsigned somPrecision = getSomPrecision(mode); + + target_t target_info = platform ?
target_t(*platform) + : get_current_target(); + + try { + CompileContext cc(isStreaming, isVectored, target_info, g); + NG ng(cc, elements, somPrecision); + + for (unsigned int i = 0; i < elements; i++) { + // Add this expression to the compiler + try { + addLitExpression(ng, i, expressions[i], flags ? flags[i] : 0, + ext ? ext[i] : nullptr, ids ? ids[i] : 0, + lens[i]); + } catch (CompileError &e) { + /* Caught a parse error; + * throw it upstream as a CompileError with a specific index */ + e.setExpressionIndex(i); + throw; /* do not slice */ + } + } + + // Check sub-expression ids + ng.rm.pl.validateSubIDs(ids, expressions, flags, elements); + // Renumber and assign lkey to reports + ng.rm.logicalKeyRenumber(); + + unsigned length = 0; + struct hs_database *out = build(ng, &length, 1); + + assert(out); //should have thrown exception on error + assert(length); + + *db = out; + *comp_error = nullptr; + + return HS_SUCCESS; + } + catch (const CompileError &e) { + // Compiler error occurred + *db = nullptr; + *comp_error = generateCompileError(e.reason, + e.hasIndex ? (int)e.index : -1); + return HS_COMPILER_ERROR; + } + catch (const std::bad_alloc &) { + *db = nullptr; + *comp_error = const_cast(&hs_enomem); + return HS_COMPILER_ERROR; + } + catch (...) 
{ + assert(!"Internal errror, unexpected exception"); + *db = nullptr; + *comp_error = const_cast(&hs_einternal); + return HS_COMPILER_ERROR; + } +} + } // namespace ue2 extern "C" HS_PUBLIC_API @@ -326,6 +450,41 @@ hs_error_t HS_CDECL hs_compile_ext_multi(const char * const *expressions, platform, db, error, Grey()); } +extern "C" HS_PUBLIC_API +hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags, + const size_t len, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error) { + if (expression == nullptr) { + *db = nullptr; + *error = generateCompileError("Invalid parameter: expression is NULL", + -1); + return HS_COMPILER_ERROR; + } + + unsigned id = 0; // single expressions get zero as an ID + const hs_expr_ext * const *ext = nullptr; // unused for this call. + + return hs_compile_lit_multi_int(&expression, &flags, &id, ext, &len, 1, + mode, platform, db, error, Grey()); +} + +extern "C" HS_PUBLIC_API +hs_error_t HS_CDECL hs_compile_lit_multi(const char * const *expressions, + const unsigned *flags, + const unsigned *ids, + const size_t *lens, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error) { + const hs_expr_ext * const *ext = nullptr; // unused for this call. 
+ return hs_compile_lit_multi_int(expressions, flags, ids, ext, lens, + elements, mode, platform, db, error, + Grey()); +} + static hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, const hs_expr_ext_t *ext, unsigned int mode, diff --git a/src/hs_compile.h b/src/hs_compile.h index c8dcfdf2..4c372ffe 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -323,6 +323,10 @@ typedef struct hs_expr_ext { * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param mode * Compiler mode flags that affect the database as a whole. One of @ref @@ -392,6 +396,10 @@ hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param ids * An array of integers specifying the ID number to be associated with the @@ -472,6 +480,10 @@ hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. 
+ * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param ids * An array of integers specifying the ID number to be associated with the @@ -527,6 +539,165 @@ hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, const hs_platform_info_t *platform, hs_database_t **db, hs_compile_error_t **error); +/** + * The basic pure literal expression compiler. + * + * This is the function call with which a pure literal expression (not a + * common regular expression) is compiled into a Hyperscan database which + * can be passed to the runtime functions (such as @ref hs_scan(), + * @ref hs_open_stream(), etc.) + * + * @param expression + * The NULL-terminated expression to parse. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. Meanwhile, the string content shall be fully parsed in a literal + * sense without any regular grammars. For example, the @p expression + * `abc?` simply means a char sequence of `a`, `b`, `c`, and `?`. The `?` + * here doesn't mean 0 or 1 quantifier under regular semantics. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Compared to @ref hs_compile(), fewer + * valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. 
+ * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * + * @param len + * The length of the text content of the pure literal expression. As the + * text content indicated by @p expression is treated as single character + * one by one, the special terminating character `\0` should be allowed + * to appear in expression, and not treated as a terminator for a string. + * Thus, the end of a pure literal expression cannot be indicated by + * identifying `\0`, but by counting to the expression length. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. 
+ */ +hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags, + const size_t len, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); +/** + * The multiple pure literal expression compiler. + * + * This is the function call with which a set of pure literal expressions is + * compiled into a database which can be passed to the runtime functions (such + * as @ref hs_scan(), @ref hs_open_stream(), etc.) Each expression can be + * labelled with a unique integer which is passed into the match callback to + * identify the pattern that has matched. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that each string + * must represent ONLY the pattern to be matched, with no delimiters or + * flags; any global flags should be specified with the @p flags argument. + * For example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as an entry of @p expressions, and @ref HS_FLAG_CASELESS as the + * matching entry of @p flags. Meanwhile, each string is parsed in a purely + * literal sense, without any regular expression syntax. For example, the + * expression `abc?` simply means the character sequence `a`, `b`, `c`, `?`; + * the `?` is not treated as a zero-or-one quantifier. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Compared to @ref hs_compile_multi(), fewer valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found.
+ * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param lens + * Array of lengths of the text content of each pure literal expression. + * Because every byte of an entry in @p expressions is treated as a + * literal character, the terminating character `\0` is allowed to appear + * inside an expression and is not treated as the end of the string. + * Thus, the end of a pure literal expression is determined not by + * finding `\0`, but by counting up to the expression length. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function.
+ * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_compile_lit_multi(const char * const *expressions, + const unsigned *flags, + const unsigned *ids, + const size_t *lens, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); + /** * Free an error structure generated by @ref hs_compile(), @ref * hs_compile_multi() or @ref hs_compile_ext_multi(). @@ -579,6 +750,10 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. * * @param info * On success, a pointer to the pattern information will be returned in @@ -641,6 +816,10 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. 
* * @param ext * A pointer to a filled @ref hs_expr_ext_t structure that defines diff --git a/src/hs_internal.h b/src/hs_internal.h index 2a00fa2f..adf07b22 100644 --- a/src/hs_internal.h +++ b/src/hs_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,6 +52,17 @@ hs_error_t hs_compile_multi_int(const char *const *expressions, hs_database_t **db, hs_compile_error_t **comp_error, const Grey &g); +/** \brief Internal use only: takes a Grey argument so that we can use it in + * tools. */ +hs_error_t hs_compile_lit_multi_int(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + const hs_expr_ext *const *ext, + const size_t *lens, unsigned elements, + unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **comp_error, + const Grey &g); } // namespace ue2 extern "C" diff --git a/src/hwlm/hwlm_literal.cpp b/src/hwlm/hwlm_literal.cpp index b257dfb0..692f7c6c 100644 --- a/src/hwlm/hwlm_literal.cpp +++ b/src/hwlm/hwlm_literal.cpp @@ -83,10 +83,9 @@ bool maskIsConsistent(const std::string &s, bool nocase, const vector &msk, * \ref HWLM_MASKLEN. 
*/ hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in, u32 id_in, hwlm_group_t groups_in, - const vector &msk_in, const vector &cmp_in, - bool pure_in) + const vector &msk_in, const vector &cmp_in) : s(s_in), id(id_in), nocase(nocase_in), noruns(noruns_in), - groups(groups_in), msk(msk_in), cmp(cmp_in), pure(pure_in) { + groups(groups_in), msk(msk_in), cmp(cmp_in) { assert(s.size() <= HWLM_LITERAL_MAX_LEN); assert(msk.size() <= HWLM_MASKLEN); assert(msk.size() == cmp.size()); diff --git a/src/hwlm/hwlm_literal.h b/src/hwlm/hwlm_literal.h index 72a57f94..598de814 100644 --- a/src/hwlm/hwlm_literal.h +++ b/src/hwlm/hwlm_literal.h @@ -113,16 +113,13 @@ struct hwlmLiteral { */ std::vector cmp; - bool pure; //!< \brief The pass-on of pure flag from LitFragment. - /** \brief Complete constructor, takes group information and msk/cmp. * * This constructor takes a msk/cmp pair. Both must be vectors of length <= * \ref HWLM_MASKLEN. */ hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in, u32 id_in, hwlm_group_t groups_in, - const std::vector &msk_in, const std::vector &cmp_in, - bool pure_in = false); + const std::vector &msk_in, const std::vector &cmp_in); /** \brief Simple constructor: no group information, no msk/cmp. 
* diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index d08bab3c..a5d67f30 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -185,7 +185,6 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) { return false; } - vis.lit.set_pure(); const ue2_literal &lit = vis.lit; if (lit.empty()) { diff --git a/src/rose/block.c b/src/rose/block.c index a32113f4..b3f424cb 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/rose/match.c b/src/rose/match.c index c91b2a50..84d3b1fd 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -238,10 +238,10 @@ hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, assert(id && id < t->size); // id is an offset into bytecode const u64a som = 0; const u8 flags = 0; - if (!scratch->pure) { - return roseRunProgram(t, scratch, id, som, end, flags); - } else { + if (t->pureLiteral) { return roseRunProgram_l(t, scratch, id, som, end, flags); + } else { + return roseRunProgram(t, scratch, id, som, end, flags); } } @@ -619,8 +619,12 @@ int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { // Our match ID is the program offset. 
const u32 program = id; const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; - hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, start, end, flags); + hwlmcb_rv_t rv; + if (rose->pureLiteral) { + rv = roseRunProgram_l(rose, scratch, program, start, end, flags); + } else { + rv = roseRunProgram(rose, scratch, program, start, end, flags); + } if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 4238f2e4..0f2d1083 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -2884,6 +2884,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, assert(programOffset >= sizeof(struct RoseEngine)); assert(programOffset < t->size); + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; const char *pc_base = getByOffset(t, programOffset); @@ -2911,6 +2912,56 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK_32) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask32(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_BYTE) { + const struct 
core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CATCH_UP) { if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { return HWLM_TERMINATE_MATCHING; @@ -2967,6 +3018,17 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. + if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(REPORT) { updateSeqPoint(tctxt, end, from_mpv); if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, @@ -3117,6 +3179,24 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(INCLUDED_JUMP) { + if (scratch->fdr_conf) { + // squash the bucket of included literal + u8 shift = scratch->fdr_conf_offset & ~7U; + u64a mask = ((~(u64a)ri->squash) << shift); + *(scratch->fdr_conf) &= mask; + + pc = getByOffset(t, ri->child_offset); + pc_base = pc; + programOffset = (const u8 *)pc_base -(const u8 *)t; + DEBUG_PRINTF("pc_base %p pc %p child_offset %u squash %u\n", + pc_base, pc, ri->child_offset, ri->squash); + work_done = 0; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(SET_LOGICAL) { DEBUG_PRINTF("set logical value of lkey %u, offset_adjust=%d\n", ri->lkey, ri->offset_adjust); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 908d13c1..5cbb5c84 100644 
--- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -2843,34 +2843,9 @@ vector groupByFragment(const RoseBuildImpl &build) { DEBUG_PRINTF("fragment candidate: lit_id=%u %s\n", lit_id, dumpString(lit.s).c_str()); - - /** 0:/xxabcdefgh/ */ - /** 1:/yyabcdefgh/ */ - /** 2:/yyabcdefgh.+/ */ - // Above 3 patterns should firstly convert into RoseLiteralMap with - // 2 elements ("xxabcdefgh" and "yyabcdefgh"), then convert into - // LitFragment with 1 element ("abcdefgh"). Special care should be - // taken to handle the 'pure' flag during the conversion. - - rose_literal_id lit_frag = getFragment(lit); - auto it = frag_info.find(lit_frag); - if (it != frag_info.end()) { - if (!lit_frag.s.get_pure() && it->first.s.get_pure()) { - struct FragmentInfo f_info = it->second; - f_info.lit_ids.push_back(lit_id); - f_info.groups |= groups; - frag_info.erase(it->first); - frag_info.emplace(lit_frag, f_info); - } else { - it->second.lit_ids.push_back(lit_id); - it->second.groups |= groups; - } - } else { - struct FragmentInfo f_info; - f_info.lit_ids.push_back(lit_id); - f_info.groups |= groups; - frag_info.emplace(lit_frag, f_info); - } + auto &fi = frag_info[getFragment(lit)]; + fi.lit_ids.push_back(lit_id); + fi.groups |= groups; } for (auto &m : frag_info) { diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index fe48da4c..7780848b 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -340,14 +340,7 @@ public: std::pair insert(const rose_literal_id &lit) { auto it = lits_index.find(lit); if (it != lits_index.end()) { - u32 idx = it->second; - auto &l = lits.at(idx); - if (!lit.s.get_pure() && l.s.get_pure()) { - lits_index.erase(l); - l.s.unset_pure(); - lits_index.emplace(l, idx); - } - return {idx, false}; + return {it->second, false}; } u32 id = verify_u32(lits.size()); lits.push_back(lit); diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 8c532cab..4fde4c44 100644 --- 
a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -727,7 +727,6 @@ void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp, const auto &s_final = lit_final.get_string(); bool nocase = lit_final.any_nocase(); - bool pure = f.s.get_pure(); DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n", f.fragment_id, escapeString(s_final).c_str(), (int)nocase, @@ -741,7 +740,7 @@ void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp, const auto &groups = f.groups; mp.lits.emplace_back(move(s_final), nocase, noruns, f.fragment_id, - groups, msk, cmp, pure); + groups, msk, cmp); } static diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index ff24a9cc..7bd6779c 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -328,6 +328,7 @@ struct RoseBoundaryReports { * nfas). Rose nfa info table can distinguish the cases. */ struct RoseEngine { + u8 pureLiteral; /* Indicator of pure literal API */ u8 noFloatingRoots; /* only need to run the anchored table if something * matched in the anchored table */ u8 requiresEodCheck; /* stuff happens at eod time */ diff --git a/src/runtime.c b/src/runtime.c index ed1eaf53..a3659348 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -141,7 +141,6 @@ void populateCoreInfo(struct hs_scratch *s, const struct RoseEngine *rose, s->deduper.current_report_offset = ~0ULL; s->deduper.som_log_dirty = 1; /* som logs have not been cleared */ s->fdr_conf = NULL; - s->pure = 0; // Rose program execution (used for some report paths) depends on these // values being initialised. 
diff --git a/src/scratch.c b/src/scratch.c index c23b5b3c..b4630640 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -137,7 +137,6 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { s->scratchSize = alloc_size; s->scratch_alloc = (char *)s_tmp; s->fdr_conf = NULL; - s->pure = 0; // each of these is at an offset from the previous char *current = (char *)s + sizeof(*s); diff --git a/src/scratch.h b/src/scratch.h index e2e8039a..1256f7ab 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -211,7 +211,6 @@ struct ALIGN_CL_DIRECTIVE hs_scratch { u64a *fdr_conf; /**< FDR confirm value */ u8 fdr_conf_offset; /**< offset where FDR/Teddy front end matches * in buffer */ - u8 pure; /**< indicator of pure-literal or cutting-literal */ }; /* array of fatbit ptr; TODO: why not an array of fatbits? */ diff --git a/src/util/ue2string.cpp b/src/util/ue2string.cpp index 98b007d4..50b2bbcc 100644 --- a/src/util/ue2string.cpp +++ b/src/util/ue2string.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -283,7 +283,6 @@ ue2_literal &ue2_literal::erase(size_type pos, size_type n) { } void ue2_literal::push_back(char c, bool nc) { - assert(!nc || ourisalpha(c)); if (nc) { c = mytoupper(c); } diff --git a/src/util/ue2string.h b/src/util/ue2string.h index 1ce51b2f..0aa84689 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -211,17 +211,10 @@ public: size_t hash() const; - void set_pure() { pure = true; } - void unset_pure() { pure = false; } - bool get_pure() const { return pure; } - - /* TODO: consider existing member functions possibly related with pure. */ - private: friend const_iterator; std::string s; boost::dynamic_bitset<> nocase; - bool pure = false; /**< born from cutting or not (pure literal). 
*/ }; /// Return a reversed copy of this literal. diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h index 820cad7c..7c2c8f9d 100644 --- a/tools/hsbench/common.h +++ b/tools/hsbench/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,7 @@ extern unsigned int somPrecisionMode; extern bool forceEditDistance; extern unsigned editDistance; extern bool printCompressSize; +extern bool useLiteralApi; /** Structure for the result of a single complete scan. */ struct ResultEntry { diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp index 3390c263..c1f1e8c4 100644 --- a/tools/hsbench/engine_hyperscan.cpp +++ b/tools/hsbench/engine_hyperscan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -411,22 +411,30 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, ext_ptr[i] = &ext[i]; } - Timer timer; - timer.start(); - hs_compile_error_t *compile_err; + Timer timer; -#ifndef RELEASE_BUILD - err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(), - ext_ptr.data(), count, full_mode, nullptr, - &db, &compile_err, grey); -#else - err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.data(), - ext_ptr.data(), count, full_mode, nullptr, - &db, &compile_err); -#endif + if (useLiteralApi) { + // Pattern length computation should be done before timer start. 
+ vector lens(count); + for (unsigned int i = 0; i < count; i++) { + lens[i] = strlen(patterns[i]); + } + timer.start(); + err = hs_compile_lit_multi_int(patterns.data(), flags.data(), + ids.data(), ext_ptr.data(), + lens.data(), count, full_mode, + nullptr, &db, &compile_err, grey); + timer.complete(); + } else { + timer.start(); + err = hs_compile_multi_int(patterns.data(), flags.data(), + ids.data(), ext_ptr.data(), count, + full_mode, nullptr, &db, &compile_err, + grey); + timer.complete(); + } - timer.complete(); compileSecs = timer.seconds(); peakMemorySize = getPeakHeap(); diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index de9fde07..8e85d7ae 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -87,6 +87,7 @@ unsigned int somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE; bool forceEditDistance = false; unsigned editDistance = 0; bool printCompressSize = false; +bool useLiteralApi = false; // Globals local to this file. 
static bool compressStream = false; @@ -218,6 +219,7 @@ void usage(const char *error) { printf(" --per-scan Display per-scan Mbit/sec results.\n"); printf(" --echo-matches Display all matches that occur during scan.\n"); printf(" --sql-out FILE Output sqlite db.\n"); + printf(" --literal-on Use Hyperscan pure literal matching.\n"); printf(" -S NAME Signature set name (for sqlite db).\n"); printf("\n\n"); @@ -250,6 +252,7 @@ void processArgs(int argc, char *argv[], vector &sigSets, int do_echo_matches = 0; int do_sql_output = 0; int option_index = 0; + int literalFlag = 0; vector sigFiles; static struct option longopts[] = { @@ -257,6 +260,7 @@ void processArgs(int argc, char *argv[], vector &sigSets, {"echo-matches", no_argument, &do_echo_matches, 1}, {"compress-stream", no_argument, &do_compress, 1}, {"sql-out", required_argument, &do_sql_output, 1}, + {"literal-on", no_argument, &literalFlag, 1}, {nullptr, 0, nullptr, 0} }; @@ -463,6 +467,8 @@ void processArgs(int argc, char *argv[], vector &sigSets, loadSignatureList(file, sigs); sigSets.emplace_back(file, move(sigs)); } + + useLiteralApi = (bool)literalFlag; } /** Start the global timer. 
*/ diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp index 595c8b84..9cfe73df 100644 --- a/tools/hscheck/main.cpp +++ b/tools/hscheck/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -92,6 +92,7 @@ bool g_allSignatures = false; bool g_forceEditDistance = false; bool build_sigs = false; bool check_logical = false; +bool use_literal_api = false; unsigned int g_signature; unsigned int g_editDistance; unsigned int globalFlags = 0; @@ -322,11 +323,26 @@ void checkExpression(UNUSED void *threadarg) { #if !defined(RELEASE_BUILD) // This variant is available in non-release builds and allows us to // modify greybox settings. - err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, mode, - nullptr, &db, &compile_err, *g_grey); + if (use_literal_api) { + size_t len = strlen(regexp); + err = hs_compile_lit_multi_int(&regexp, &flags, nullptr, &extp, + &len, 1, mode, nullptr, &db, + &compile_err, *g_grey); + } else { + err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, + mode, nullptr, &db, &compile_err, + *g_grey); + } #else - err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, mode, - nullptr, &db, &compile_err); + if (use_literal_api) { + size_t len = strlen(regexp); + err = hs_compile_lit_multi_int(&regexp, &flags, nullptr, &extp, + &len, 1, mode, nullptr, &db, + &compile_err, *g_grey); + } else { + err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, + mode, nullptr, &db, &compile_err); + } #endif if (err == HS_SUCCESS) { @@ -381,6 +397,11 @@ void checkLogicalExpression(UNUSED void *threadarg) { ExprExtMap::const_iterator it; while (getNextLogicalExpression(it)) { + if (use_literal_api) { + recordSuccess(g_exprMap, it->first); + continue; + } + const ParsedExpr &comb = it->second; vector subIds; @@ -470,6 +491,7 @@ void
usage() { << " -h Display this help." << endl << " -B Build signature set." << endl << " -C Check logical combinations (default: off)." << endl + << " --literal-on Processing pure literals, no need to check." << endl << endl; } @@ -477,9 +499,15 @@ static void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { const char options[] = "e:E:s:z:hHLNV8G:T:BC"; bool signatureSet = false; + int literalFlag = 0; + + static struct option longopts[] = { + {"literal-on", no_argument, &literalFlag, 1}, + {nullptr, 0, nullptr, 0} + }; for (;;) { - int c = getopt_long(argc, argv, options, nullptr, nullptr); + int c = getopt_long(argc, argv, options, longopts, nullptr); if (c < 0) { break; } @@ -539,6 +567,9 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { case 'C': check_logical = true; break; + case 0: + case 1: + break; default: usage(); exit(1); @@ -564,6 +595,8 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { usage(); exit(1); } + + use_literal_api = (bool)literalFlag; } static diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index 5a4bdc00..f30a8f5e 100644 --- a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -43,6 +43,7 @@ #include "parser/Parser.h" #include "parser/parse_error.h" #include "util/make_unique.h" +#include "util/string_util.h" #include "util/unicode_def.h" #include "util/unordered.h" @@ -111,6 +112,15 @@ bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander, return false; } + if (use_literal_api) { + // filter out flags not supported by pure literal API. 
+ u32 not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 | + HS_FLAG_UCP | HS_FLAG_PREFILTER; + hs_flags &= ~not_supported; + force_utf8 = false; + force_prefilter = false; + } + expr.swap(regex); if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som, @@ -260,9 +270,29 @@ GroundTruth::compile(unsigned id, bool no_callouts) { throw PcreCompileFailure("Unable to decode flags."); } + // When hyperscan literal api is on, transfer the regex string into hex. + if (use_literal_api && !combination) { + unsigned char *pat + = reinterpret_cast(const_cast(re.c_str())); + char *str = makeHex(pat, re.length()); + if (!str) { + throw PcreCompileFailure("makeHex() malloc failure."); + } + re.assign(str); + free(str); + } + // filter out flags not supported by PCRE u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET | HS_EXT_FLAG_MIN_LENGTH; + if (use_literal_api) { + ext.flags &= 0ULL; + ext.min_offset = 0; + ext.max_offset = MAX_OFFSET; + ext.min_length = 0; + ext.edit_distance = 0; + ext.hamming_distance = 0; + } if (ext.flags & ~supported) { // edit distance is a known unsupported flag, so just throw a soft error if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) { @@ -314,7 +344,6 @@ GroundTruth::compile(unsigned id, bool no_callouts) { return compiled; } - compiled->bytecode = pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr); diff --git a/tools/hscollider/NfaGeneratedCorpora.cpp b/tools/hscollider/NfaGeneratedCorpora.cpp index b7c77ee1..66ae270b 100644 --- a/tools/hscollider/NfaGeneratedCorpora.cpp +++ b/tools/hscollider/NfaGeneratedCorpora.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,6 +32,7 @@ #include "ng_corpus_generator.h" #include "NfaGeneratedCorpora.h" #include "ExpressionParser.h" +#include 
"common.h" #include "grey.h" #include "hs_compile.h" @@ -44,6 +45,7 @@ #include "util/compile_context.h" #include "util/compile_error.h" #include "util/report_manager.h" +#include "util/string_util.h" #include "util/target_info.h" #include @@ -80,6 +82,18 @@ void NfaGeneratedCorpora::generate(unsigned id, vector &data) { throw CorpusFailure("Expression could not be read: " + i->second); } + // When hyperscan literal api is on, transfer the regex string into hex. + if (use_literal_api && !(hs_flags & HS_FLAG_COMBINATION)) { + unsigned char *pat + = reinterpret_cast(const_cast(re.c_str())); + char *str = makeHex(pat, re.length()); + if (!str) { + throw CorpusFailure("makeHex() malloc failure."); + } + re.assign(str); + free(str); + } + // Combination's corpus is consist of sub-expressions' corpuses. if (hs_flags & HS_FLAG_COMBINATION) { ParsedLogical pl; diff --git a/tools/hscollider/UltimateTruth.cpp b/tools/hscollider/UltimateTruth.cpp index c37e39ba..038fbf77 100644 --- a/tools/hscollider/UltimateTruth.cpp +++ b/tools/hscollider/UltimateTruth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -925,11 +925,22 @@ compileHyperscan(vector &patterns, vector &flags, const unsigned count = patterns.size(); hs_database_t *db = nullptr; hs_compile_error_t *compile_err; + hs_error_t err; - hs_error_t err = hs_compile_multi_int(&patterns[0], &flags[0], - &idsvec[0], ext.c_array(), count, - mode, platform, &db, - &compile_err, grey); + if (use_literal_api) { + // Compute length of each pattern. 
+ vector<size_t> lens(count); + for (unsigned int i = 0; i < count; i++) { + lens[i] = strlen(patterns[i]); + } + err = hs_compile_lit_multi_int(&patterns[0], &flags[0], &idsvec[0], + ext.c_array(), &lens[0], count, mode, + platform, &db, &compile_err, grey); + } else { + err = hs_compile_multi_int(&patterns[0], &flags[0], &idsvec[0], + ext.c_array(), count, mode, platform, &db, + &compile_err, grey); + } if (err != HS_SUCCESS) { error = compile_err->message; diff --git a/tools/hscollider/args.cpp b/tools/hscollider/args.cpp index 3b515027..2eb510e0 100644 --- a/tools/hscollider/args.cpp +++ b/tools/hscollider/args.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -116,6 +116,7 @@ void usage(const char *name, const char *error) { printf(" --abort-on-fail Abort, rather than exit, on failure.\n"); printf(" --no-signal-handler Do not handle handle signals (to generate " "backtraces).\n"); + printf(" --literal-on Use Hyperscan pure literal matching.\n"); printf("\n"); printf("Memory and resource control options:\n"); printf("\n"); @@ -174,6 +175,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, int mangleScratch = 0; int compressFlag = 0; int compressResetFlag = 0; + int literalFlag = 0; static const struct option longopts[] = { {"copy-scratch", 0, &copyScratch, 1}, {"copy-stream", 0, &copyStream, 1}, @@ -187,6 +189,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, {"compress-expand", 0, &compressFlag, 1}, {"compress-reset-expand", 0, &compressResetFlag, 1}, {"no-groups", 0, &no_groups, 1}, + {"literal-on", 0, &literalFlag, 1}, {nullptr, 0, nullptr, 0}}; for (;;) { @@ -589,4 +592,5 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, use_mangle_scratch = (bool) mangleScratch;
use_compress_expand = (bool)compressFlag; use_compress_reset_expand = (bool)compressResetFlag; + use_literal_api = (bool)literalFlag; } diff --git a/tools/hscollider/common.h b/tools/hscollider/common.h index d9a0144c..67e488c0 100644 --- a/tools/hscollider/common.h +++ b/tools/hscollider/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -82,6 +82,7 @@ extern bool use_copy_stream; extern bool use_mangle_scratch; extern bool use_compress_expand; extern bool use_compress_reset_expand; +extern bool use_literal_api; extern int abort_on_failure; extern int no_signal_handler; extern bool force_edit_distance; diff --git a/tools/hscollider/main.cpp b/tools/hscollider/main.cpp index 18d7a016..afa6ef5a 100644 --- a/tools/hscollider/main.cpp +++ b/tools/hscollider/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -118,6 +118,7 @@ bool use_copy_stream = false; bool use_mangle_scratch = false; bool use_compress_expand = false; bool use_compress_reset_expand = false; +bool use_literal_api = false; int abort_on_failure = 0; int no_signal_handler = 0; size_t max_scan_queue_len = 25000; diff --git a/tools/hsdump/main.cpp b/tools/hsdump/main.cpp index 3221d1b6..75db1c4f 100644 --- a/tools/hsdump/main.cpp +++ b/tools/hsdump/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -106,6 +106,8 @@ bool dump_intermediate = true; bool 
force_edit_distance = false; u32 edit_distance = 0; +int use_literal_api = 0; + } // namespace // Usage statement. @@ -139,6 +141,7 @@ void usage(const char *name, const char *error) { printf(" -8 Force UTF8 mode on all patterns.\n"); printf(" -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n"); printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n"); + printf(" --literal-on Use Hyperscan pure literal matching API.\n"); printf("\n"); printf("Example:\n"); printf("$ %s -e pattern.file -s sigfile\n", name); @@ -163,6 +166,7 @@ void processArgs(int argc, char *argv[], Grey &grey) { {"utf8", no_argument, nullptr, '8'}, {"prefilter", no_argument, &force_prefilter, 1}, {"som-width", required_argument, nullptr, 'd'}, + {"literal-on", no_argument, &use_literal_api, 1}, {nullptr, 0, nullptr, 0} }; @@ -501,9 +505,23 @@ unsigned int dumpDataMulti(const vector &patterns, hs_database_t *db = nullptr; hs_compile_error_t *compile_err; - hs_error_t err = hs_compile_multi_int( - patterns.data(), flags.data(), ids.data(), ext.c_array(), - patterns.size(), mode, plat_info.get(), &db, &compile_err, grey); + hs_error_t err; + const size_t count = patterns.size(); + if (use_literal_api) { + // Compute length of each pattern. 
+ vector lens(count); + for (unsigned int i = 0; i < count; i++) { + lens[i] = strlen(patterns[i]); + } + err = hs_compile_lit_multi_int(patterns.data(), flags.data(), + ids.data(), ext.c_array(), lens.data(), + count, mode, plat_info.get(), &db, + &compile_err, grey); + } else { + err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(), + ext.c_array(), count, mode, plat_info.get(), + &db, &compile_err, grey); + } if (err != HS_SUCCESS) { if (compile_err && compile_err->message) { diff --git a/util/string_util.h b/util/string_util.h index 658eb704..b44586ea 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -127,4 +127,25 @@ void prettyPrintRange(std::ostream &out, it_t begin, it_t end) { } } +// Transfer given string into a hex-escaped pattern. +// Returns a malloc()ed, NUL-terminated buffer that the caller must free(), +// or nullptr if the allocation fails. +static really_inline +char *makeHex(const unsigned char *pat, unsigned patlen) { + // Widen before multiplying so the length cannot wrap in unsigned. + size_t hexlen = (size_t)patlen * 4; + char *hexbuf = (char *)malloc(hexlen + 1); + if (!hexbuf) { + return nullptr; + } + unsigned i; + char *buf; + for (i = 0, buf = hexbuf; i < patlen; i++, buf += 4) { + // Each byte becomes the 4 characters "\xHH"; size 5 covers the NUL. + snprintf(buf, 5, "\\x%02x", (unsigned char)pat[i]); + } + hexbuf[hexlen] = '\0'; + return hexbuf; +} + #endif // STRING_UTIL_H From 435cd23823b0fb6721b4f88bce49ba2a0f5037a7 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Wed, 17 Jul 2019 23:45:59 +0800 Subject: [PATCH 13/18] Literal API: update dev-reference --- doc/dev-reference/compilation.rst | 69 +++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 214f4abc..5d2c70f7 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -54,6 +54,75 @@ version of Hyperscan used to scan with it.
Hyperscan provides support for targeting a database at a particular CPU platform; see :ref:`instr_specialization` for details. +===================== +Compile Pure Literals +===================== + +A pure literal is a special case of regular expression. A character sequence is +regarded as a pure literal if and only if each character is read and +interpreted independently. No syntax association happens between any adjacent +characters. + +For example, consider an expression written as :regexp:`/bc?/`. We could say it is +a regular expression, with the meaning that character ``b`` is followed by nothing +or by one character ``c``. From another view, we could also say it is a pure +literal expression, with the meaning that this is a character sequence of 3-byte +length, containing characters ``b``, ``c`` and ``?``. In the regular case, the +question mark character ``?`` has a particular syntax role called 0-1 quantifier, +which has a syntax association with the character ahead of it. Similar +characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``, +``}``, ``-``, ``*``, ``+``, ``\``, ``|``, ``/``, ``:``, ``^``, ``.``, ``$``. +While in the pure literal case, all these meta characters lose their extra meanings +and are treated as ordinary ASCII characters. + +Hyperscan was initially designed to process common regular expressions. It is +hence embedded with a complex parser to do comprehensive regular grammar +interpretation. Particularly, the identification of above meta characters is the +basic step for the interpretation of far more complex regular grammars. + +However in real cases, patterns may not always be regular expressions. They +could just be pure literals. Problems arise if the pure literals contain +regular meta characters. If fed directly into the traditional Hyperscan +compile API, all these meta characters will be interpreted in predefined ways, +which is unnecessary and the result is totally out of expectation. 
To avoid +such misunderstanding by traditional API, users have to preprocess these +literal patterns by converting the meta characters into some other formats: +either by adding a backslash ``\`` before certain meta characters, or by +converting all the characters into a hexadecimal representation. + +In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns: + +#. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern + database. + +#. :c:func:`hs_compile_lit_multi`: compiles an array of pure literals into a + pattern database. All of the supplied patterns will be scanned for + concurrently at scan time, with user-supplied identifiers returned when they + match. + +These 2 APIs are designed for use cases where all patterns contained in the +target rule set are pure literals. Users can pass the initial pure literal +content directly into these APIs without worrying about writing regular meta +characters in their patterns. No preprocessing work is needed any more. + +For new APIs, the ``length`` of each literal pattern is a newly added parameter. +Hyperscan needs to locate the end position of the input expression via clearly +knowing each literal's length, not by simply identifying character ``\0`` of a +string. + +Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_MULTILINE`, +:c:member:`HS_FLAG_SINGLEMATCH`, :c:member:`HS_FLAG_SOM_LEFTMOST`. + +.. note:: We don't support literal compilation API with :ref:`extparam`. And + for runtime implementation, traditional runtime APIs can still be + used to match pure literal patterns. + +.. note:: If the target rule set contains at least one regular expression, + please use traditional compile APIs :c:func:`hs_compile`, + :c:func:`hs_compile_multi` and :c:func:`hs_compile_ext_multi`. + The new literal APIs introduced here are designed for rule sets + containing only pure literal expressions. 
+ *************** Pattern Support *************** From 177537313a00569b86cab8d5edd3823c61c9d487 Mon Sep 17 00:00:00 2001 From: "Wang, Xiang W" Date: Tue, 2 Jul 2019 23:37:21 -0400 Subject: [PATCH 14/18] Chimera: don't disable single match flag when checking Hyperscan support --- chimera/ch_compile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chimera/ch_compile.cpp b/chimera/ch_compile.cpp index 374bd7ad..46536f31 100644 --- a/chimera/ch_compile.cpp +++ b/chimera/ch_compile.cpp @@ -322,7 +322,7 @@ PatternData::PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in, ch_misc_free(info); u32 guardflags; - guardflags = (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH; + guardflags = flags | HS_FLAG_PREFILTER; guard = isHyperscanSupported(pattern, guardflags, platform); } else { // We can't even prefilter this pattern, so we're dependent on Big Dumb From 49592833a7c507f06385a4b6b2ec343c05eaf8f8 Mon Sep 17 00:00:00 2001 From: "Wang, Xiang W" Date: Tue, 9 Jul 2019 20:18:42 -0400 Subject: [PATCH 15/18] Scratch: fix scratch free issue when memory allocation fails Fixes github issue #174 --- src/scratch.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scratch.c b/src/scratch.c index b4630640..25991e2b 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -279,7 +279,9 @@ hs_error_t HS_CDECL hs_alloc_scratch(const hs_database_t *db, hs_error_t proto_ret = hs_check_alloc(proto_tmp); if (proto_ret != HS_SUCCESS) { hs_scratch_free(proto_tmp); - hs_scratch_free(*scratch); + if (*scratch) { + hs_scratch_free((*scratch)->scratch_alloc); + } *scratch = NULL; return proto_ret; } From e395cd3166cbe935fa17faa8cac9730b012f066a Mon Sep 17 00:00:00 2001 From: Bobby Martin Date: Wed, 27 Mar 2019 09:52:00 -0700 Subject: [PATCH 16/18] Add windows DLL support (with AVX2 flag removed currently) --- CMakeLists.txt | 18 +++++++-------- hs.def | 43 ++++++++++++++++++++++++++++++++++++ hs_runtime.def | 36 ++++++++++++++++++++++++++++++ 
tools/hsbench/CMakeLists.txt | 10 +++++---- tools/hscheck/CMakeLists.txt | 6 ++++- tools/hsdump/CMakeLists.txt | 6 ++++- unit/CMakeLists.txt | 4 ++++ 7 files changed, 107 insertions(+), 16 deletions(-) create mode 100644 hs.def create mode 100644 hs_runtime.def diff --git a/CMakeLists.txt b/CMakeLists.txt index d3995362..a37eaa90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ else() endif() if(CMAKE_BUILD_TYPE MATCHES RELEASE|RELWITHDEBINFO|MINSIZEREL) + message(STATUS "using release build") set(RELEASE_BUILD TRUE) else() set(RELEASE_BUILD FALSE) @@ -109,11 +110,9 @@ option(BUILD_SHARED_LIBS "Build shared libs instead of static" OFF) option(BUILD_STATIC_AND_SHARED "Build shared libs as well as static" OFF) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) - if (WIN32) - message(FATAL_ERROR "Windows DLLs currently not supported") - else() message(STATUS "Building shared libraries") - endif() +else() + message(STATUS "Building static libraries") endif() if (NOT BUILD_SHARED_LIBS) @@ -151,9 +150,6 @@ if(MSVC OR MSVC_IDE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /wd4267 /Qdiag-disable:remark") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") else() - # todo: change these as required - set(ARCH_C_FLAGS "/arch:AVX2") - set(ARCH_CXX_FLAGS "/arch:AVX2") set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 /wd4996 -D_CRT_SECURE_NO_WARNINGS") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD") @@ -1298,12 +1294,14 @@ endif() if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) if (NOT FAT_RUNTIME) add_library(hs_runtime_shared SHARED src/hs_version.c - src/hs_valid_platform.c $) + src/hs_valid_platform.c $ + hs_runtime.def) else() add_library(hs_runtime_shared SHARED 
src/hs_version.c src/hs_valid_platform.c $ - ${RUNTIME_SHLIBS}) + ${RUNTIME_SHLIBS} + hs_runtime.def) endif() set_target_properties(hs_runtime_shared PROPERTIES VERSION ${LIB_VERSION} @@ -1349,7 +1347,7 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) ${RUNTIME_SHLIBS}) endif () - add_library(hs_shared SHARED ${hs_shared_SRCS}) + add_library(hs_shared SHARED ${hs_shared_SRCS} hs.def) add_dependencies(hs_shared ragel_Parser) set_target_properties(hs_shared PROPERTIES diff --git a/hs.def b/hs.def new file mode 100644 index 00000000..28f7877c --- /dev/null +++ b/hs.def @@ -0,0 +1,43 @@ +; Hyperscan DLL export definitions + +LIBRARY hs + +EXPORTS + hs_alloc_scratch + hs_clone_scratch + hs_close_stream + hs_compile + hs_compile_ext_multi + hs_compile_multi + hs_compress_stream + hs_copy_stream + hs_database_info + hs_database_size + hs_deserialize_database + hs_deserialize_database_at + hs_expand_stream + hs_expression_ext_info + hs_expression_info + hs_free_compile_error + hs_free_database + hs_free_scratch + hs_open_stream + hs_populate_platform + hs_reset_and_copy_stream + hs_reset_and_expand_stream + hs_reset_stream + hs_scan + hs_scan_stream + hs_scan_vector + hs_scratch_size + hs_serialize_database + hs_serialized_database_info + hs_serialized_database_size + hs_set_allocator + hs_set_database_allocator + hs_set_misc_allocator + hs_set_scratch_allocator + hs_set_stream_allocator + hs_stream_size + hs_valid_platform + hs_version diff --git a/hs_runtime.def b/hs_runtime.def new file mode 100644 index 00000000..6c434bed --- /dev/null +++ b/hs_runtime.def @@ -0,0 +1,36 @@ +; Hyperscan DLL export definitions + +LIBRARY hs_runtime + +EXPORTS + hs_alloc_scratch + hs_clone_scratch + hs_close_stream + hs_compress_stream + hs_copy_stream + hs_database_info + hs_database_size + hs_deserialize_database + hs_deserialize_database_at + hs_expand_stream + hs_free_database + hs_free_scratch + hs_open_stream + hs_reset_and_copy_stream + hs_reset_and_expand_stream + 
hs_reset_stream + hs_scan + hs_scan_stream + hs_scan_vector + hs_scratch_size + hs_serialize_database + hs_serialized_database_info + hs_serialized_database_size + hs_set_allocator + hs_set_database_allocator + hs_set_misc_allocator + hs_set_scratch_allocator + hs_set_stream_allocator + hs_stream_size + hs_valid_platform + hs_version \ No newline at end of file diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt index 465081a8..bbceda41 100644 --- a/tools/hsbench/CMakeLists.txt +++ b/tools/hsbench/CMakeLists.txt @@ -56,10 +56,7 @@ if (BUILD_CHIMERA) engine_pcre.cpp engine_pcre.h ) -endif() - -add_executable(hsbench ${hsbench_SOURCES}) -if (BUILD_CHIMERA) + add_executable(hsbench ${hsbench_SOURCES}) include_directories(${PCRE_INCLUDE_DIRS}) if(NOT WIN32) target_link_libraries(hsbench hs chimera ${PCRE_LDFLAGS} databaseutil @@ -69,6 +66,11 @@ if (BUILD_CHIMERA) expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) endif() else() + if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) + add_executable(hsbench ${hsbench_SOURCES} $ $) + else() + add_executable(hsbench ${hsbench_SOURCES}) + endif() target_link_libraries(hsbench hs databaseutil expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) endif() diff --git a/tools/hscheck/CMakeLists.txt b/tools/hscheck/CMakeLists.txt index 8f45765a..2ae06137 100644 --- a/tools/hscheck/CMakeLists.txt +++ b/tools/hscheck/CMakeLists.txt @@ -16,7 +16,11 @@ if (BUILD_CHIMERA) target_link_libraries(hscheck hs chimera pcre expressionutil) endif() else() - add_executable(hscheck ${hscheck_SOURCES}) + if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) + add_executable(hscheck ${hscheck_SOURCES} $ $) + else() + add_executable(hscheck ${hscheck_SOURCES}) + endif() if(NOT WIN32) target_link_libraries(hscheck hs expressionutil pthread) else() diff --git a/tools/hsdump/CMakeLists.txt b/tools/hsdump/CMakeLists.txt index 4350b0f6..0466d572 100644 --- a/tools/hsdump/CMakeLists.txt +++ 
b/tools/hsdump/CMakeLists.txt @@ -10,6 +10,10 @@ include_directories(${PROJECT_SOURCE_DIR}/util) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") -add_executable(hsdump main.cpp) +if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) + add_executable(hsdump main.cpp $ $) +else() + add_executable(hsdump main.cpp) +endif() target_link_libraries(hsdump hs expressionutil crosscompileutil) diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 32e01450..b0706fa8 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -129,7 +129,11 @@ set(unit_internal_SOURCES internal/main.cpp ) +if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) +add_executable(unit-internal ${unit_internal_SOURCES} $ $) +else() add_executable(unit-internal ${unit_internal_SOURCES}) +endif() set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}") target_link_libraries(unit-internal hs corpusomatic) endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) From fb42be1539b72f0ed5adac2975026d0d7e59e586 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Fri, 12 Jul 2019 22:55:44 +0800 Subject: [PATCH 17/18] changelog: updates for 5.2.0 release --- CHANGELOG.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2de58a7b..bc8910bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,21 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [5.2.0] 2019-07-12 +- Literal API: add new API `hs_compile_lit()` and `hs_compile_lit_multi()` to + process pure literal rule sets. The 2 literal APIs treat each expression text + in a literal sense without recognizing any regular grammars. +- Logical combination: add support for purely negative combinations, which + report match at EOD in case of no sub-expressions matched. +- Windows porting: support shared library (DLL) on Windows with available tools + hscheck, hsbench and hsdump. 
+- Bugfix for issue #148: fix uninitialized use of `scatter_unit_uX` due to + padding. +- Bugfix for issue #155: fix numerical result out of range error. +- Bugfix for issue #165: avoid corruption of pending combination report in + streaming mode. +- Bugfix for issue #174: fix scratch free issue when memory allocation fails. + ## [5.1.1] 2019-04-03 - Add extra detection and handling when invalid rose programs are triggered. - Bugfix for issue #136: fix CMake parsing of CPU architecure for GCC-9. From ae8c8ee1c85ab3df4fb515ba52f3942c4ab55926 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Fri, 12 Jul 2019 23:01:00 +0800 Subject: [PATCH 18/18] Bump version number for release --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a37eaa90..3801f994 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11) project (hyperscan C CXX) set (HS_MAJOR_VERSION 5) -set (HS_MINOR_VERSION 1) -set (HS_PATCH_VERSION 1) +set (HS_MINOR_VERSION 2) +set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)