Merge branch develop to master

This commit is contained in:
Chang, Harry 2019-01-30 10:22:48 +08:00
commit 90cd1863d6
43 changed files with 4093 additions and 2783 deletions

View File

@ -2,6 +2,24 @@
This is a list of notable changes to Hyperscan, in reverse chronological order. This is a list of notable changes to Hyperscan, in reverse chronological order.
## [5.1.0] 2019-01-17
- Improve DFA state compression by wide-state optimization to reduce bytecode
size.
- Create specific interpreter runtime handling to boost the performance of pure
literal matching.
- Optimize original presentation of interpreter (the "Rose" engine ) to
increase overall performance.
- Bugfix for logical combinations: fix error reporting combination's match in
case of sub-expression has EOD match under streaming mode.
- Bugfix for logical combinations: fix miss reporting combination's match under
vacuous input.
- Bugfix for issue #104: fix compile error with Boost 1.68.0.
- Bugfix for issue #127: avoid pcre error for hscollider with installed PCRE
package.
- Update version of PCRE used by testing tools as a syntax and semantic
reference to PCRE 8.41 or above.
- Fix github repo address in doc.
## [5.0.0] 2018-07-09 ## [5.0.0] 2018-07-09
- Introduce chimera hybrid engine of Hyperscan and PCRE, to fully support - Introduce chimera hybrid engine of Hyperscan and PCRE, to fully support
PCRE syntax as well as to take advantage of the high performance nature of PCRE syntax as well as to take advantage of the high performance nature of

View File

@ -2,7 +2,7 @@ cmake_minimum_required (VERSION 2.8.11)
project (hyperscan C CXX) project (hyperscan C CXX)
set (HS_MAJOR_VERSION 5) set (HS_MAJOR_VERSION 5)
set (HS_MINOR_VERSION 0) set (HS_MINOR_VERSION 1)
set (HS_PATCH_VERSION 0) set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
@ -456,7 +456,7 @@ set(PCRE_REQUIRED_MINOR_VERSION 41)
set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})
include (${CMAKE_MODULE_PATH}/pcre.cmake) include (${CMAKE_MODULE_PATH}/pcre.cmake)
if (NOT CORRECT_PCRE_VERSION) if (NOT CORRECT_PCRE_VERSION)
message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found") message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} or above not found")
endif() endif()
# we need static libs for Chimera - too much deep magic for shared libs # we need static libs for Chimera - too much deep magic for shared libs
@ -508,7 +508,7 @@ set(PCRE_REQUIRED_MINOR_VERSION 41)
set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})
include (${CMAKE_MODULE_PATH}/pcre.cmake) include (${CMAKE_MODULE_PATH}/pcre.cmake)
if (NOT CORRECT_PCRE_VERSION) if (NOT CORRECT_PCRE_VERSION)
message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found") message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} or above not found")
endif() endif()
# we need static libs for Chimera - too much deep magic for shared libs # we need static libs for Chimera - too much deep magic for shared libs

View File

@ -714,7 +714,7 @@ ch_error_t HS_CDECL ch_compile(const char *expression, unsigned flags,
(int)e.index : -1); (int)e.index : -1);
return CH_COMPILER_ERROR; return CH_COMPILER_ERROR;
} }
catch (std::bad_alloc) { catch (std::bad_alloc &) {
*db = nullptr; *db = nullptr;
*comp_error = const_cast<ch_compile_error_t *>(&ch_enomem); *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
return CH_COMPILER_ERROR; return CH_COMPILER_ERROR;
@ -782,7 +782,7 @@ ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
(int)e.index : -1); (int)e.index : -1);
return CH_COMPILER_ERROR; return CH_COMPILER_ERROR;
} }
catch (std::bad_alloc) { catch (std::bad_alloc &) {
*db = nullptr; *db = nullptr;
*comp_error = const_cast<ch_compile_error_t *>(&ch_enomem); *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
return CH_COMPILER_ERROR; return CH_COMPILER_ERROR;
@ -855,7 +855,7 @@ ch_error_t HS_CDECL ch_compile_ext_multi(
(int)e.index : -1); (int)e.index : -1);
return CH_COMPILER_ERROR; return CH_COMPILER_ERROR;
} }
catch (std::bad_alloc) { catch (std::bad_alloc &) {
*db = nullptr; *db = nullptr;
*comp_error = const_cast<ch_compile_error_t *>(&ch_enomem); *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
return CH_COMPILER_ERROR; return CH_COMPILER_ERROR;

View File

@ -216,7 +216,6 @@ ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *hydb,
} }
if (db->flags & CHIMERA_FLAG_NO_MULTIMATCH) { if (db->flags & CHIMERA_FLAG_NO_MULTIMATCH) {
(*scratch)->multi_scratch = NULL;
return CH_SUCCESS; return CH_SUCCESS;
} }

View File

@ -27,7 +27,7 @@ if (PCRE_BUILD_SOURCE)
# first, check version number # first, check version number
CHECK_C_SOURCE_COMPILES("#include <pcre.h.generic> CHECK_C_SOURCE_COMPILES("#include <pcre.h.generic>
#if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR != ${PCRE_REQUIRED_MINOR_VERSION} #if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR < ${PCRE_REQUIRED_MINOR_VERSION}
#error Incorrect pcre version #error Incorrect pcre version
#endif #endif
main() {}" CORRECT_PCRE_VERSION) main() {}" CORRECT_PCRE_VERSION)
@ -35,10 +35,10 @@ if (PCRE_BUILD_SOURCE)
if (NOT CORRECT_PCRE_VERSION) if (NOT CORRECT_PCRE_VERSION)
unset(CORRECT_PCRE_VERSION CACHE) unset(CORRECT_PCRE_VERSION CACHE)
message(STATUS "Incorrect version of pcre - version ${PCRE_REQUIRED_VERSION} is required") message(STATUS "Incorrect version of pcre - version ${PCRE_REQUIRED_VERSION} or above is required")
return () return ()
else() else()
message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} - building from source.") message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} or above - building from source.")
endif() endif()
# PCRE compile options # PCRE compile options
@ -52,12 +52,12 @@ if (PCRE_BUILD_SOURCE)
else () else ()
# pkgconf should save us # pkgconf should save us
find_package(PkgConfig) find_package(PkgConfig)
pkg_check_modules(PCRE libpcre=${PCRE_REQUIRED_VERSION}) pkg_check_modules(PCRE libpcre>=${PCRE_REQUIRED_VERSION})
if (PCRE_FOUND) if (PCRE_FOUND)
set(CORRECT_PCRE_VERSION TRUE) set(CORRECT_PCRE_VERSION TRUE)
message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION}") message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} or above")
else () else ()
message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} not found") message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} or above not found")
return () return ()
endif () endif ()
endif (PCRE_BUILD_SOURCE) endif (PCRE_BUILD_SOURCE)

View File

@ -64,7 +64,7 @@ libpcre are supported. The use of unsupported constructs will result in
compilation errors. compilation errors.
The version of PCRE used to validate Hyperscan's interpretation of this syntax The version of PCRE used to validate Hyperscan's interpretation of this syntax
is 8.41. is 8.41 or above.
==================== ====================
Supported Constructs Supported Constructs

View File

@ -10,7 +10,7 @@ Very Quick Start
#. Clone Hyperscan :: #. Clone Hyperscan ::
cd <where-you-want-hyperscan-source> cd <where-you-want-hyperscan-source>
git clone git://github/intel/hyperscan git clone git://github.com/intel/hyperscan
#. Configure Hyperscan #. Configure Hyperscan

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -807,6 +807,9 @@ void findIncludedLits(vector<hwlmLiteral> &lits,
for (size_t i = 0; i < cnt; i++) { for (size_t i = 0; i < cnt; i++) {
u32 bucket1 = group[i].first; u32 bucket1 = group[i].first;
u32 id1 = group[i].second; u32 id1 = group[i].second;
if (lits[id1].pure) {
continue;
}
buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map, buildSquashMask(lits, id1, bucket1, i + 1, group, parent_map,
exception_map); exception_map);
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -62,6 +62,7 @@ struct LitInfo {
u8 size; u8 size;
u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above. u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above.
u8 next; u8 next;
u8 pure; //!< The pass-on of pure flag from hwlmLiteral.
}; };
#define FDRC_FLAG_NO_CONFIRM 1 #define FDRC_FLAG_NO_CONFIRM 1

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -87,6 +87,7 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
info.flags = flags; info.flags = flags;
info.size = verify_u8(max(lit.msk.size(), lit.s.size())); info.size = verify_u8(max(lit.msk.size(), lit.s.size()));
info.groups = lit.groups; info.groups = lit.groups;
info.pure = lit.pure;
// these are built up assuming a LE machine // these are built up assuming a LE machine
CONF_TYPE msk = all_ones; CONF_TYPE msk = all_ones;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -65,6 +65,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
u8 oldNext; // initialized in loop u8 oldNext; // initialized in loop
do { do {
assert(ISALIGNED(li)); assert(ISALIGNED(li));
scratch->pure = li->pure;
if (unlikely((conf_key & li->msk) != li->v)) { if (unlikely((conf_key & li->msk) != li->v)) {
goto out; goto out;
@ -99,6 +100,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
li++; li++;
} while (oldNext); } while (oldNext);
scratch->fdr_conf = NULL; scratch->fdr_conf = NULL;
scratch->pure = 0;
} }
#endif #endif

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -82,6 +82,7 @@ Grey::Grey(void) :
onlyOneOutfix(false), onlyOneOutfix(false),
allowShermanStates(true), allowShermanStates(true),
allowMcClellan8(true), allowMcClellan8(true),
allowWideStates(true), // enable wide state for McClellan8
highlanderPruneDFA(true), highlanderPruneDFA(true),
minimizeDFA(true), minimizeDFA(true),
accelerateDFA(true), accelerateDFA(true),
@ -197,7 +198,15 @@ void applyGreyOverrides(Grey *g, const string &s) {
string::const_iterator ve = find(ke, pe, ';'); string::const_iterator ve = find(ke, pe, ';');
unsigned int value = lexical_cast<unsigned int>(string(ke + 1, ve)); unsigned int value = 0;
try {
value = lexical_cast<unsigned int>(string(ke + 1, ve));
} catch (boost::bad_lexical_cast &e) {
printf("Invalid grey override key %s:%s\n", key.c_str(),
string(ke + 1, ve).c_str());
invalid_key_seen = true;
break;
}
bool done = false; bool done = false;
/* surely there exists a nice template to go with this macro to make /* surely there exists a nice template to go with this macro to make
@ -251,6 +260,7 @@ void applyGreyOverrides(Grey *g, const string &s) {
G_UPDATE(onlyOneOutfix); G_UPDATE(onlyOneOutfix);
G_UPDATE(allowShermanStates); G_UPDATE(allowShermanStates);
G_UPDATE(allowMcClellan8); G_UPDATE(allowMcClellan8);
G_UPDATE(allowWideStates);
G_UPDATE(highlanderPruneDFA); G_UPDATE(highlanderPruneDFA);
G_UPDATE(minimizeDFA); G_UPDATE(minimizeDFA);
G_UPDATE(accelerateDFA); G_UPDATE(accelerateDFA);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -87,6 +87,7 @@ struct Grey {
bool allowShermanStates; bool allowShermanStates;
bool allowMcClellan8; bool allowMcClellan8;
bool allowWideStates; // enable wide state for McClellan8
bool highlanderPruneDFA; bool highlanderPruneDFA;
bool minimizeDFA; bool minimizeDFA;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -83,9 +83,10 @@ bool maskIsConsistent(const std::string &s, bool nocase, const vector<u8> &msk,
* \ref HWLM_MASKLEN. */ * \ref HWLM_MASKLEN. */
hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in, hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in,
bool noruns_in, u32 id_in, hwlm_group_t groups_in, bool noruns_in, u32 id_in, hwlm_group_t groups_in,
const vector<u8> &msk_in, const vector<u8> &cmp_in) const vector<u8> &msk_in, const vector<u8> &cmp_in,
bool pure_in)
: s(s_in), id(id_in), nocase(nocase_in), noruns(noruns_in), : s(s_in), id(id_in), nocase(nocase_in), noruns(noruns_in),
groups(groups_in), msk(msk_in), cmp(cmp_in) { groups(groups_in), msk(msk_in), cmp(cmp_in), pure(pure_in) {
assert(s.size() <= HWLM_LITERAL_MAX_LEN); assert(s.size() <= HWLM_LITERAL_MAX_LEN);
assert(msk.size() <= HWLM_MASKLEN); assert(msk.size() <= HWLM_MASKLEN);
assert(msk.size() == cmp.size()); assert(msk.size() == cmp.size());

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -113,15 +113,20 @@ struct hwlmLiteral {
*/ */
std::vector<u8> cmp; std::vector<u8> cmp;
bool pure; //!< \brief The pass-on of pure flag from LitFragment.
/** \brief Complete constructor, takes group information and msk/cmp. /** \brief Complete constructor, takes group information and msk/cmp.
* *
* This constructor takes a msk/cmp pair. Both must be vectors of length <= * This constructor takes a msk/cmp pair. Both must be vectors of length <=
* \ref HWLM_MASKLEN. */ * \ref HWLM_MASKLEN. */
hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in, hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in,
u32 id_in, hwlm_group_t groups_in, u32 id_in, hwlm_group_t groups_in,
const std::vector<u8> &msk_in, const std::vector<u8> &cmp_in); const std::vector<u8> &msk_in, const std::vector<u8> &cmp_in,
bool pure_in = false);
/** \brief Simple constructor: no group information, no msk/cmp. */ /** \brief Simple constructor: no group information, no msk/cmp.
*
* This constructor is only used in internal unit test. */
hwlmLiteral(const std::string &s_in, bool nocase_in, u32 id_in) hwlmLiteral(const std::string &s_in, bool nocase_in, u32 id_in)
: hwlmLiteral(s_in, nocase_in, false, id_in, HWLM_ALL_GROUPS, {}, {}) {} : hwlmLiteral(s_in, nocase_in, false, id_in, HWLM_ALL_GROUPS, {}, {}) {}
}; };

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -40,6 +40,11 @@ namespace ue2 {
class ReportManager; class ReportManager;
struct Grey; struct Grey;
enum DfaType {
McClellan,
Sheng,
Gough
};
class accel_dfa_build_strat : public dfa_build_strat { class accel_dfa_build_strat : public dfa_build_strat {
public: public:
@ -53,6 +58,8 @@ public:
virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info, virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out); void *accel_out);
virtual std::map<dstate_id_t, AccelScheme> getAccelInfo(const Grey &grey); virtual std::map<dstate_id_t, AccelScheme> getAccelInfo(const Grey &grey);
virtual DfaType getType() const = 0;
private: private:
bool only_accel_init; bool only_accel_init;
}; };

View File

@ -91,6 +91,7 @@ public:
void buildAccel(dstate_id_t this_idx, const AccelScheme &info, void buildAccel(dstate_id_t this_idx, const AccelScheme &info,
void *accel_out) override; void *accel_out) override;
u32 max_allowed_offset_accel() const override { return 0; } u32 max_allowed_offset_accel() const override { return 0; }
DfaType getType() const override { return Gough; }
raw_som_dfa &rdfa; raw_som_dfa &rdfa;
const GoughGraph &gg; const GoughGraph &gg;

View File

@ -980,7 +980,7 @@ u32 addSquashMask(const build_info &args, const NFAVertex &v,
// see if we've already seen it, otherwise add a new one. // see if we've already seen it, otherwise add a new one.
auto it = find(squash.begin(), squash.end(), sit->second); auto it = find(squash.begin(), squash.end(), sit->second);
if (it != squash.end()) { if (it != squash.end()) {
return verify_u32(distance(squash.begin(), it)); return verify_u32(std::distance(squash.begin(), it));
} }
u32 idx = verify_u32(squash.size()); u32 idx = verify_u32(squash.size());
squash.push_back(sit->second); squash.push_back(sit->second);
@ -1007,7 +1007,7 @@ u32 addReports(const flat_set<ReportID> &r, vector<ReportID> &reports,
auto it = search(begin(reports), end(reports), begin(my_reports), auto it = search(begin(reports), end(reports), begin(my_reports),
end(my_reports)); end(my_reports));
if (it != end(reports)) { if (it != end(reports)) {
u32 offset = verify_u32(distance(begin(reports), it)); u32 offset = verify_u32(std::distance(begin(reports), it));
DEBUG_PRINTF("reusing found report list at %u\n", offset); DEBUG_PRINTF("reusing found report list at %u\n", offset);
return offset; return offset;
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -167,9 +167,68 @@ u32 doNormal16(const struct mcclellan *m, const u8 **c_inout, const u8 *end,
} }
static really_inline static really_inline
char mcclellanExec16_i(const struct mcclellan *m, u32 *state, const u8 *buf, u32 doNormalWide16(const struct mcclellan *m, const u8 **c_inout,
size_t len, u64a offAdj, NfaCallback cb, void *ctxt, const u8 *end, u32 s, char *qstate, u16 *offset,
char single, const u8 **c_final, enum MatchMode mode) { char do_accel, enum MatchMode mode) {
const u8 *c = *c_inout;
u32 wide_limit = m->wide_limit;
const char *wide_base
= (const char *)m - sizeof(struct NFA) + m->wide_offset;
const u16 *succ_table
= (const u16 *)((const char *)m + sizeof(struct mcclellan));
assert(ISALIGNED_N(succ_table, 2));
u32 sherman_base = m->sherman_limit;
const char *sherman_base_offset
= (const char *)m - sizeof(struct NFA) + m->sherman_offset;
u32 as = m->alphaShift;
s &= STATE_MASK;
while (c < end && s) {
u8 cprime = m->remap[*c];
DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u) &c: %p\n", *c,
ourisprint(*c) ? *c : '?', cprime, s, c);
if (unlikely(s >= wide_limit)) {
const char *wide_entry
= findWideEntry16(m, wide_base, wide_limit, s);
DEBUG_PRINTF("doing wide head (%u)\n", s);
s = doWide16(wide_entry, &c, end, m->remap, (u16 *)&s, qstate,
offset);
} else if (s >= sherman_base) {
const char *sherman_state
= findShermanState(m, sherman_base_offset, sherman_base, s);
DEBUG_PRINTF("doing sherman (%u)\n", s);
s = doSherman16(sherman_state, cprime, succ_table, as);
} else {
DEBUG_PRINTF("doing normal\n");
s = succ_table[(s << as) + cprime];
}
DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK);
c++;
if (do_accel && (s & ACCEL_FLAG)) {
break;
}
if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) {
break;
}
s &= STATE_MASK;
}
*c_inout = c;
return s;
}
static really_inline
char mcclellanExec16_i(const struct mcclellan *m, u32 *state, char *qstate,
const u8 *buf, size_t len, u64a offAdj, NfaCallback cb,
void *ctxt, char single, const u8 **c_final,
enum MatchMode mode) {
assert(ISALIGNED_N(state, 2)); assert(ISALIGNED_N(state, 2));
if (!len) { if (!len) {
if (mode == STOP_AT_MATCH) { if (mode == STOP_AT_MATCH) {
@ -179,6 +238,7 @@ char mcclellanExec16_i(const struct mcclellan *m, u32 *state, const u8 *buf,
} }
u32 s = *state; u32 s = *state;
u16 offset = 0;
const u8 *c = buf; const u8 *c = buf;
const u8 *c_end = buf + len; const u8 *c_end = buf + len;
const struct mstate_aux *aux const struct mstate_aux *aux
@ -207,7 +267,12 @@ without_accel:
goto exit; goto exit;
} }
s = doNormal16(m, &c, min_accel_offset, s, 0, mode); if (unlikely(m->has_wide)) {
s = doNormalWide16(m, &c, min_accel_offset, s, qstate, &offset, 0,
mode);
} else {
s = doNormal16(m, &c, min_accel_offset, s, 0, mode);
}
if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) {
if (mode == STOP_AT_MATCH) { if (mode == STOP_AT_MATCH) {
@ -259,7 +324,11 @@ with_accel:
} }
} }
s = doNormal16(m, &c, c_end, s, 1, mode); if (unlikely(m->has_wide)) {
s = doNormalWide16(m, &c, c_end, s, qstate, &offset, 1, mode);
} else {
s = doNormal16(m, &c, c_end, s, 1, mode);
}
if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) {
if (mode == STOP_AT_MATCH) { if (mode == STOP_AT_MATCH) {
@ -297,44 +366,47 @@ exit:
} }
static never_inline static never_inline
char mcclellanExec16_i_cb(const struct mcclellan *m, u32 *state, const u8 *buf, char mcclellanExec16_i_cb(const struct mcclellan *m, u32 *state, char *qstate,
size_t len, u64a offAdj, NfaCallback cb, void *ctxt, const u8 *buf, size_t len, u64a offAdj,
char single, const u8 **final_point) { NfaCallback cb, void *ctxt, char single,
return mcclellanExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, const u8 **final_point) {
final_point, CALLBACK_OUTPUT); return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt,
single, final_point, CALLBACK_OUTPUT);
} }
static never_inline static never_inline
char mcclellanExec16_i_sam(const struct mcclellan *m, u32 *state, const u8 *buf, char mcclellanExec16_i_sam(const struct mcclellan *m, u32 *state, char *qstate,
size_t len, u64a offAdj, NfaCallback cb, void *ctxt, const u8 *buf, size_t len, u64a offAdj,
char single, const u8 **final_point) { NfaCallback cb, void *ctxt, char single,
return mcclellanExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, const u8 **final_point) {
final_point, STOP_AT_MATCH); return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt,
single, final_point, STOP_AT_MATCH);
} }
static never_inline static never_inline
char mcclellanExec16_i_nm(const struct mcclellan *m, u32 *state, const u8 *buf, char mcclellanExec16_i_nm(const struct mcclellan *m, u32 *state, char *qstate,
size_t len, u64a offAdj, NfaCallback cb, void *ctxt, const u8 *buf, size_t len, u64a offAdj,
char single, const u8 **final_point) { NfaCallback cb, void *ctxt, char single,
return mcclellanExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, const u8 **final_point) {
final_point, NO_MATCHES); return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt,
single, final_point, NO_MATCHES);
} }
static really_inline static really_inline
char mcclellanExec16_i_ni(const struct mcclellan *m, u32 *state, const u8 *buf, char mcclellanExec16_i_ni(const struct mcclellan *m, u32 *state, char *qstate,
size_t len, u64a offAdj, NfaCallback cb, void *ctxt, const u8 *buf, size_t len, u64a offAdj,
char single, const u8 **final_point, NfaCallback cb, void *ctxt, char single,
enum MatchMode mode) { const u8 **final_point, enum MatchMode mode) {
if (mode == CALLBACK_OUTPUT) { if (mode == CALLBACK_OUTPUT) {
return mcclellanExec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, return mcclellanExec16_i_cb(m, state, qstate, buf, len, offAdj, cb,
single, final_point); ctxt, single, final_point);
} else if (mode == STOP_AT_MATCH) { } else if (mode == STOP_AT_MATCH) {
return mcclellanExec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, return mcclellanExec16_i_sam(m, state, qstate, buf, len, offAdj, cb,
single, final_point); ctxt, single, final_point);
} else { } else {
assert(mode == NO_MATCHES); assert(mode == NO_MATCHES);
return mcclellanExec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, return mcclellanExec16_i_nm(m, state, qstate, buf, len, offAdj, cb,
single, final_point); ctxt, single, final_point);
} }
} }
@ -540,6 +612,10 @@ char mcclellanCheckEOD(const struct NFA *nfa, u32 s, u64a offset,
const struct mcclellan *m = getImplNfa(nfa); const struct mcclellan *m = getImplNfa(nfa);
const struct mstate_aux *aux = get_aux(m, s); const struct mstate_aux *aux = get_aux(m, s);
if (m->has_wide == 1 && s >= m->wide_limit) {
return MO_CONTINUE_MATCHING;
}
if (!aux->accept_eod) { if (!aux->accept_eod) {
return MO_CONTINUE_MATCHING; return MO_CONTINUE_MATCHING;
} }
@ -612,9 +688,9 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
/* do main buffer region */ /* do main buffer region */
const u8 *final_look; const u8 *final_look;
char rv = mcclellanExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, char rv = mcclellanExec16_i_ni(m, &s, q->state, cur_buf + sp,
offset + sp, cb, context, single, local_ep - sp, offset + sp, cb, context,
&final_look, mode); single, &final_look, mode);
if (rv == MO_DEAD) { if (rv == MO_DEAD) {
*(u16 *)q->state = 0; *(u16 *)q->state = 0;
return MO_DEAD; return MO_DEAD;
@ -684,12 +760,16 @@ char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, const u8 *buffer,
const struct mcclellan *m = getImplNfa(n); const struct mcclellan *m = getImplNfa(n);
u32 s = m->start_anchored; u32 s = m->start_anchored;
if (mcclellanExec16_i(m, &s, buffer, length, offset, cb, context, single, if (mcclellanExec16_i(m, &s, NULL, buffer, length, offset, cb, context,
NULL, CALLBACK_OUTPUT) single, NULL, CALLBACK_OUTPUT)
== MO_DEAD) { == MO_DEAD) {
return s ? MO_ALIVE : MO_DEAD; return s ? MO_ALIVE : MO_DEAD;
} }
if (m->has_wide == 1 && s >= m->wide_limit) {
return MO_ALIVE;
}
const struct mstate_aux *aux = get_aux(m, s); const struct mstate_aux *aux = get_aux(m, s);
if (aux->accept_eod) { if (aux->accept_eod) {
@ -768,6 +848,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
char rv = mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, char rv = mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp,
offset + sp, cb, context, single, offset + sp, cb, context, single,
&final_look, mode); &final_look, mode);
if (rv == MO_HALT_MATCHING) { if (rv == MO_HALT_MATCHING) {
*(u8 *)q->state = 0; *(u8 *)q->state = 0;
return MO_DEAD; return MO_DEAD;
@ -1016,7 +1097,8 @@ char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report,
u16 s = *(u16 *)q->state; u16 s = *(u16 *)q->state;
DEBUG_PRINTF("checking accepts for %hu\n", s); DEBUG_PRINTF("checking accepts for %hu\n", s);
return mcclellanHasAccept(m, get_aux(m, s), report); return (m->has_wide == 1 && s >= m->wide_limit) ?
0 : mcclellanHasAccept(m, get_aux(m, s), report);
} }
char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q) { char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q) {
@ -1026,7 +1108,8 @@ char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q) {
u16 s = *(u16 *)q->state; u16 s = *(u16 *)q->state;
DEBUG_PRINTF("checking accepts for %hu\n", s); DEBUG_PRINTF("checking accepts for %hu\n", s);
return !!get_aux(m, s)->accept; return (m->has_wide == 1 && s >= m->wide_limit) ?
0 : !!get_aux(m, s)->accept;
} }
char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end) { char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end) {
@ -1111,6 +1194,12 @@ char nfaExecMcClellan16_initCompressedState(const struct NFA *nfa, u64a offset,
void *state, UNUSED u8 key) { void *state, UNUSED u8 key) {
const struct mcclellan *m = getImplNfa(nfa); const struct mcclellan *m = getImplNfa(nfa);
u16 s = offset ? m->start_floating : m->start_anchored; u16 s = offset ? m->start_floating : m->start_anchored;
// new byte
if (m->has_wide) {
unaligned_store_u16((u16 *)state + 1, 0);
}
if (s) { if (s) {
unaligned_store_u16(state, s); unaligned_store_u16(state, s);
return 1; return 1;
@ -1140,14 +1229,24 @@ void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state,
const u8 *buf, char top, size_t start_off, const u8 *buf, char top, size_t start_off,
size_t len, NfaCallback cb, void *ctxt) { size_t len, NfaCallback cb, void *ctxt) {
const struct mcclellan *m = getImplNfa(nfa); const struct mcclellan *m = getImplNfa(nfa);
u32 s;
u32 s = top ? m->start_anchored : unaligned_load_u16(state); if (top) {
s = m->start_anchored;
// new byte
if (m->has_wide) {
unaligned_store_u16((u16 *)state + 1, 0);
}
} else {
s = unaligned_load_u16(state);
}
if (m->flags & MCCLELLAN_FLAG_SINGLE) { if (m->flags & MCCLELLAN_FLAG_SINGLE) {
mcclellanExec16_i(m, &s, buf + start_off, len - start_off, mcclellanExec16_i(m, &s, state, buf + start_off, len - start_off,
start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT);
} else { } else {
mcclellanExec16_i(m, &s, buf + start_off, len - start_off, mcclellanExec16_i(m, &s, state, buf + start_off, len - start_off,
start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT);
} }
@ -1178,9 +1277,16 @@ char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa,
char nfaExecMcClellan16_queueInitState(UNUSED const struct NFA *nfa, char nfaExecMcClellan16_queueInitState(UNUSED const struct NFA *nfa,
struct mq *q) { struct mq *q) {
assert(nfa->scratchStateSize == 2); const struct mcclellan *m = getImplNfa(nfa);
assert(m->has_wide == 1 ? nfa->scratchStateSize == 4
: nfa->scratchStateSize == 2);
assert(ISALIGNED_N(q->state, 2)); assert(ISALIGNED_N(q->state, 2));
*(u16 *)q->state = 0; *(u16 *)q->state = 0;
// new byte
if (m->has_wide) {
unaligned_store_u16((u16 *)q->state + 1, 0);
}
return 0; return 0;
} }
@ -1206,21 +1312,39 @@ char nfaExecMcClellan8_expandState(UNUSED const struct NFA *nfa, void *dest,
char nfaExecMcClellan16_queueCompressState(UNUSED const struct NFA *nfa, char nfaExecMcClellan16_queueCompressState(UNUSED const struct NFA *nfa,
const struct mq *q, const struct mq *q,
UNUSED s64a loc) { UNUSED s64a loc) {
const struct mcclellan *m = getImplNfa(nfa);
void *dest = q->streamState; void *dest = q->streamState;
const void *src = q->state; const void *src = q->state;
assert(nfa->scratchStateSize == 2); assert(m->has_wide == 1 ? nfa->scratchStateSize == 4
assert(nfa->streamStateSize == 2); : nfa->scratchStateSize == 2);
assert(m->has_wide == 1 ? nfa->streamStateSize == 4
: nfa->streamStateSize == 2);
assert(ISALIGNED_N(src, 2)); assert(ISALIGNED_N(src, 2));
unaligned_store_u16(dest, *(const u16 *)(src)); unaligned_store_u16(dest, *(const u16 *)(src));
// new byte
if (m->has_wide) {
unaligned_store_u16((u16 *)dest + 1, *((const u16 *)src + 1));
}
return 0; return 0;
} }
char nfaExecMcClellan16_expandState(UNUSED const struct NFA *nfa, void *dest, char nfaExecMcClellan16_expandState(UNUSED const struct NFA *nfa, void *dest,
const void *src, UNUSED u64a offset, const void *src, UNUSED u64a offset,
UNUSED u8 key) { UNUSED u8 key) {
assert(nfa->scratchStateSize == 2); const struct mcclellan *m = getImplNfa(nfa);
assert(nfa->streamStateSize == 2); assert(m->has_wide == 1 ? nfa->scratchStateSize == 4
: nfa->scratchStateSize == 2);
assert(m->has_wide == 1 ? nfa->streamStateSize == 4
: nfa->streamStateSize == 2);
assert(ISALIGNED_N(dest, 2)); assert(ISALIGNED_N(dest, 2));
*(u16 *)dest = unaligned_load_u16(src); *(u16 *)dest = unaligned_load_u16(src);
// new byte
if (m->has_wide) {
*((u16 *)dest + 1) = unaligned_load_u16((const u16 *)src + 1);
}
return 0; return 0;
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -82,3 +82,108 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET);
return succ_table[(daddy << as) + cprime]; return succ_table[(daddy << as) + cprime];
} }
static really_inline
u16 doWide16(const char *wide_entry, const u8 **c_inout, const u8 *end,
const u8 *remap, const u16 *s, char *qstate, u16 *offset) {
// Internal relative offset after the last visit of the wide state.
if (qstate != NULL) { // stream mode
*offset = unaligned_load_u16((const u16 *)(qstate + 2));
}
u8 successful = 0;
const u8 *c = *c_inout;
u32 len_c = end - c;
u16 width = *(const u16 *)(wide_entry + WIDE_WIDTH_OFFSET);
assert(width >= 8);
const u8 *symbols = (const u8 *)(wide_entry + WIDE_SYMBOL_OFFSET16);
const u16 *trans = (const u16 *)(wide_entry +
WIDE_TRANSITION_OFFSET16(width));
assert(*offset < width);
u16 len_w = width - *offset;
const u8 *sym = symbols + *offset;
char tmp[16];
u16 pos = 0;
if (*offset == 0 && remap[*c] != *sym) {
goto normal;
}
// both in (16, +oo).
while (len_w >= 16 && len_c >= 16) {
m128 str_w = loadu128(sym);
for (size_t i = 0; i < 16; i++) {
tmp[i] = remap[*(c + i)];
}
m128 str_c = loadu128(tmp);
u32 z = movemask128(eq128(str_w, str_c));
pos = ctz32(~z);
assert(pos <= 16);
if (pos < 16) {
goto normal;
}
sym += 16;
c += 16;
len_w -= 16;
len_c -= 16;
}
pos = 0;
// at least one in (0, 16).
u32 loadLength_w = MIN(len_w, 16);
u32 loadLength_c = MIN(len_c, 16);
m128 str_w = loadbytes128(sym, loadLength_w);
for (size_t i = 0; i < loadLength_c; i++) {
tmp[i] = remap[*(c + i)];
}
m128 str_c = loadbytes128(tmp, loadLength_c);
u32 z = movemask128(eq128(str_w, str_c));
pos = ctz32(~z);
pos = MIN(pos, MIN(loadLength_w, loadLength_c));
if (loadLength_w <= loadLength_c) {
assert(pos <= loadLength_w);
// successful matching.
if (pos == loadLength_w) {
c -= 1;
successful = 1;
}
// failure, do nothing.
} else {
assert(pos <= loadLength_c);
// successful partial matching.
if (pos == loadLength_c) {
c -= 1;
goto partial;
}
// failure, do nothing.
}
normal:
*offset = 0;
if (qstate != NULL) {
// Internal relative offset.
unaligned_store_u16(qstate + 2, *offset);
}
c += pos;
*c_inout = c;
return successful ? *trans : *(trans + 1 + remap[*c]);
partial:
*offset = sym - symbols + pos;
if (qstate != NULL) {
// Internal relative offset.
unaligned_store_u16(qstate + 2, *offset);
}
c += pos;
*c_inout = c;
return *s;
}

View File

@ -50,6 +50,16 @@ extern "C"
#define SHERMAN_CHARS_OFFSET 4 #define SHERMAN_CHARS_OFFSET 4
#define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len)) #define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len))
#define WIDE_STATE 2
#define WIDE_ENTRY_OFFSET8(weo_pos) (2 + (weo_pos))
#define WIDE_ENTRY_OFFSET16(weo_pos) (4 + (weo_pos))
#define WIDE_WIDTH_OFFSET 0
#define WIDE_SYMBOL_OFFSET8 1
#define WIDE_TRANSITION_OFFSET8(wto_width) (1 + (wto_width))
#define WIDE_SYMBOL_OFFSET16 2
#define WIDE_TRANSITION_OFFSET16(wto_width) (2 + ROUNDUP_N(wto_width, 2))
struct report_list { struct report_list {
u32 count; u32 count;
ReportID report[]; ReportID report[];
@ -79,13 +89,17 @@ struct mcclellan {
u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ u16 accel_limit_8; /**< 8 bit, lowest accelerable state */
u16 accept_limit_8; /**< 8 bit, lowest accept state */ u16 accept_limit_8; /**< 8 bit, lowest accept state */
u16 sherman_limit; /**< lowest sherman state */ u16 sherman_limit; /**< lowest sherman state */
u16 wide_limit; /**< 8/16 bit, lowest wide head state */
u8 alphaShift; u8 alphaShift;
u8 flags; u8 flags;
u8 has_accel; /**< 1 iff there are any accel plans */ u8 has_accel; /**< 1 iff there are any accel plans */
u8 has_wide; /**< 1 iff there exists any wide state */
u8 remap[256]; /**< remaps characters to a smaller alphabet */ u8 remap[256]; /**< remaps characters to a smaller alphabet */
ReportID arb_report; /**< one of the accepts that this dfa may raise */ ReportID arb_report; /**< one of the accepts that this dfa may raise */
u32 accel_offset; /**< offset of accel structures from start of McClellan */ u32 accel_offset; /**< offset of accel structures from start of McClellan */
u32 haig_offset; /**< reserved for use by Haig, relative to start of NFA */ u32 haig_offset; /**< reserved for use by Haig, relative to start of NFA */
u32 wide_offset; /**< offset of the wide state entries to the start of the
* nfa structure */
}; };
static really_inline static really_inline
@ -106,6 +120,43 @@ char *findMutableShermanState(char *sherman_base_offset, u16 sherman_base,
return sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); return sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base);
} }
static really_inline
const char *findWideEntry8(UNUSED const struct mcclellan *m,
const char *wide_base, u32 wide_limit, u32 s) {
UNUSED u8 type = *(const u8 *)wide_base;
assert(type == WIDE_STATE);
const u32 entry_offset
= *(const u32 *)(wide_base
+ WIDE_ENTRY_OFFSET8((s - wide_limit) * sizeof(u32)));
const char *rv = wide_base + entry_offset;
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
return rv;
}
static really_inline
const char *findWideEntry16(UNUSED const struct mcclellan *m,
const char *wide_base, u32 wide_limit, u32 s) {
UNUSED u8 type = *(const u8 *)wide_base;
assert(type == WIDE_STATE);
const u32 entry_offset
= *(const u32 *)(wide_base
+ WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));
const char *rv = wide_base + entry_offset;
assert(rv < (const char *)m + m->length - sizeof(struct NFA));
return rv;
}
static really_inline
char *findMutableWideEntry16(char *wide_base, u32 wide_limit, u32 s) {
u32 entry_offset
= *(const u32 *)(wide_base
+ WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));
return wide_base + entry_offset;
}
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -56,13 +56,19 @@
#include <cstring> #include <cstring>
#include <map> #include <map>
#include <memory> #include <memory>
#include <queue>
#include <set> #include <set>
#include <vector> #include <vector>
#include <boost/range/adaptor/map.hpp> #include <boost/range/adaptor/map.hpp>
#include "mcclellandump.h"
#include "util/dump_util.h"
#include "util/dump_charclass.h"
using namespace std; using namespace std;
using boost::adaptors::map_keys; using boost::adaptors::map_keys;
using boost::dynamic_bitset;
#define ACCEL_DFA_MAX_OFFSET_DEPTH 4 #define ACCEL_DFA_MAX_OFFSET_DEPTH 4
@ -82,6 +88,8 @@ namespace /* anon */ {
struct dstate_extra { struct dstate_extra {
u16 daddytaken = 0; u16 daddytaken = 0;
bool shermanState = false; bool shermanState = false;
bool wideState = false;
bool wideHead = false;
}; };
struct dfa_info { struct dfa_info {
@ -89,6 +97,8 @@ struct dfa_info {
raw_dfa &raw; raw_dfa &raw;
vector<dstate> &states; vector<dstate> &states;
vector<dstate_extra> extra; vector<dstate_extra> extra;
vector<vector<dstate_id_t>> wide_state_chain;
vector<vector<symbol_t>> wide_symbol_chain;
const u16 alpha_size; /* including special symbols */ const u16 alpha_size; /* including special symbols */
const array<u16, ALPHABET_SIZE> &alpha_remap; const array<u16, ALPHABET_SIZE> &alpha_remap;
const u16 impl_alpha_size; const u16 impl_alpha_size;
@ -112,6 +122,14 @@ struct dfa_info {
return extra[raw_id].shermanState; return extra[raw_id].shermanState;
} }
bool is_widestate(dstate_id_t raw_id) const {
return extra[raw_id].wideState;
}
bool is_widehead(dstate_id_t raw_id) const {
return extra[raw_id].wideHead;
}
size_t size(void) const { return states.size(); } size_t size(void) const { return states.size(); }
}; };
@ -124,6 +142,35 @@ u8 dfa_info::getAlphaShift() const {
} }
} }
struct state_prev_info {
vector<vector<dstate_id_t>> prev_vec;
explicit state_prev_info(size_t alpha_size) : prev_vec(alpha_size) {}
};
struct DfaPrevInfo {
u16 impl_alpha_size;
u16 state_num;
vector<state_prev_info> states;
set<dstate_id_t> accepts;
explicit DfaPrevInfo(raw_dfa &rdfa);
};
DfaPrevInfo::DfaPrevInfo(raw_dfa &rdfa)
: impl_alpha_size(rdfa.getImplAlphaSize()), state_num(rdfa.states.size()),
states(state_num, state_prev_info(impl_alpha_size)){
for (size_t i = 0; i < states.size(); i++) {
for (symbol_t sym = 0; sym < impl_alpha_size; sym++) {
dstate_id_t curr = rdfa.states[i].next[sym];
states[curr].prev_vec[sym].push_back(i);
}
if (!rdfa.states[i].reports.empty()
|| !rdfa.states[i].reports_eod.empty()) {
DEBUG_PRINTF("accept raw state: %ld\n", i);
accepts.insert(i);
}
}
}
} // namespace } // namespace
static static
@ -151,6 +198,11 @@ void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
for (size_t j = 0; j < alphaSize; j++) { for (size_t j = 0; j < alphaSize; j++) {
size_t c_prime = (i << alphaShift) + j; size_t c_prime = (i << alphaShift) + j;
// wide state has no aux structure.
if (m->has_wide && succ_table[c_prime] >= m->wide_limit) {
continue;
}
mstate_aux *aux = getAux(n, succ_table[c_prime]); mstate_aux *aux = getAux(n, succ_table[c_prime]);
if (aux->accept) { if (aux->accept) {
@ -165,7 +217,8 @@ void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
/* handle the sherman states */ /* handle the sherman states */
char *sherman_base_offset = (char *)n + m->sherman_offset; char *sherman_base_offset = (char *)n + m->sherman_offset;
for (u16 j = m->sherman_limit; j < m->state_count; j++) { u16 sherman_ceil = m->has_wide == 1 ? m->wide_limit : m->state_count;
for (u16 j = m->sherman_limit; j < sherman_ceil; j++) {
char *sherman_cur char *sherman_cur
= findMutableShermanState(sherman_base_offset, m->sherman_limit, j); = findMutableShermanState(sherman_base_offset, m->sherman_limit, j);
assert(*(sherman_cur + SHERMAN_TYPE_OFFSET) == SHERMAN_STATE); assert(*(sherman_cur + SHERMAN_TYPE_OFFSET) == SHERMAN_STATE);
@ -174,6 +227,11 @@ void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
for (u8 i = 0; i < len; i++) { for (u8 i = 0; i < len; i++) {
u16 succ_i = unaligned_load_u16((u8 *)&succs[i]); u16 succ_i = unaligned_load_u16((u8 *)&succs[i]);
// wide state has no aux structure.
if (m->has_wide && succ_i >= m->wide_limit) {
continue;
}
mstate_aux *aux = getAux(n, succ_i); mstate_aux *aux = getAux(n, succ_i);
if (aux->accept) { if (aux->accept) {
@ -187,6 +245,51 @@ void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) {
unaligned_store_u16((u8 *)&succs[i], succ_i); unaligned_store_u16((u8 *)&succs[i], succ_i);
} }
} }
/* handle the wide states */
if (m->has_wide) {
u32 wide_limit = m->wide_limit;
char *wide_base = (char *)n + m->wide_offset;
assert(*wide_base == WIDE_STATE);
u16 wide_number = verify_u16(info.wide_symbol_chain.size());
// traverse over wide head states.
for (u16 j = wide_limit; j < wide_limit + wide_number; j++) {
char *wide_cur
= findMutableWideEntry16(wide_base, wide_limit, j);
u16 width = *(const u16 *)(wide_cur + WIDE_WIDTH_OFFSET);
u16 *trans = (u16 *)(wide_cur + WIDE_TRANSITION_OFFSET16(width));
// check successful transition
u16 next = unaligned_load_u16((u8 *)trans);
if (next < wide_limit) {
mstate_aux *aux = getAux(n, next);
if (aux->accept) {
next |= ACCEPT_FLAG;
}
if (aux->accel_offset) {
next |= ACCEL_FLAG;
}
unaligned_store_u16((u8 *)trans, next);
}
trans++;
// check failure transition
for (symbol_t k = 0; k < alphaSize; k++) {
u16 next_k = unaligned_load_u16((u8 *)&trans[k]);
if (next_k >= wide_limit) {
continue;
}
mstate_aux *aux_k = getAux(n, next_k);
if (aux_k->accept) {
next_k |= ACCEPT_FLAG;
}
if (aux_k->accel_offset) {
next_k |= ACCEL_FLAG;
}
unaligned_store_u16((u8 *)&trans[k], next_k);
}
}
}
} }
u32 mcclellan_build_strat::max_allowed_offset_accel() const { u32 mcclellan_build_strat::max_allowed_offset_accel() const {
@ -232,6 +335,19 @@ void populateBasicInfo(size_t state_size, const dfa_info &info,
m->start_anchored = info.implId(info.raw.start_anchored); m->start_anchored = info.implId(info.raw.start_anchored);
m->start_floating = info.implId(info.raw.start_floating); m->start_floating = info.implId(info.raw.start_floating);
m->has_accel = accel_count ? 1 : 0; m->has_accel = accel_count ? 1 : 0;
m->has_wide = info.wide_state_chain.size() > 0 ? 1 : 0;
if (state_size == sizeof(u8) && m->has_wide == 1) {
// allocate 1 more byte for wide state use.
nfa->scratchStateSize += sizeof(u8);
nfa->streamStateSize += sizeof(u8);
}
if (state_size == sizeof(u16) && m->has_wide == 1) {
// allocate 2 more bytes for wide state use.
nfa->scratchStateSize += sizeof(u16);
nfa->streamStateSize += sizeof(u16);
}
if (single) { if (single) {
m->flags |= MCCLELLAN_FLAG_SINGLE; m->flags |= MCCLELLAN_FLAG_SINGLE;
@ -404,6 +520,24 @@ size_t calcShermanRegionSize(const dfa_info &info) {
return ROUNDUP_16(rv); return ROUNDUP_16(rv);
} }
static
size_t calcWideRegionSize(const dfa_info &info) {
if (info.wide_state_chain.empty()) {
return 0;
}
// wide info header
size_t rv = info.wide_symbol_chain.size() * sizeof(u32) + 4;
// wide info body
for (const auto &chain : info.wide_symbol_chain) {
rv += ROUNDUP_N(chain.size(), 2) +
(info.impl_alpha_size + 1) * sizeof(u16) + 2;
}
return ROUNDUP_16(rv);
}
static static
void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info, void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info,
const vector<u32> &reports, const vector<u32> &reports_eod, const vector<u32> &reports, const vector<u32> &reports_eod,
@ -418,42 +552,60 @@ void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info,
/* returns false on error */ /* returns false on error */
static static
bool allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) { bool allocateFSN16(dfa_info &info, dstate_id_t *sherman_base,
dstate_id_t *wide_limit) {
info.states[0].impl_id = 0; /* dead is always 0 */ info.states[0].impl_id = 0; /* dead is always 0 */
vector<dstate_id_t> norm; vector<dstate_id_t> norm;
vector<dstate_id_t> sherm; vector<dstate_id_t> sherm;
vector<dstate_id_t> wideHead;
vector<dstate_id_t> wideState;
if (info.size() > (1 << 16)) { if (info.size() > (1 << 16)) {
DEBUG_PRINTF("too many states\n"); DEBUG_PRINTF("too many states\n");
*sherman_base = 0; *wide_limit = 0;
return false; return false;
} }
for (u32 i = 1; i < info.size(); i++) { for (u32 i = 1; i < info.size(); i++) {
if (info.is_sherman(i)) { if (info.is_widehead(i)) {
wideHead.push_back(i);
} else if (info.is_widestate(i)) {
wideState.push_back(i);
} else if (info.is_sherman(i)) {
sherm.push_back(i); sherm.push_back(i);
} else { } else {
norm.push_back(i); norm.push_back(i);
} }
} }
dstate_id_t next_norm = 1; dstate_id_t next = 1;
for (const dstate_id_t &s : norm) { for (const dstate_id_t &s : norm) {
info.states[s].impl_id = next_norm++; DEBUG_PRINTF("[norm] mapping state %u to %u\n", s, next);
info.states[s].impl_id = next++;
} }
*sherman_base = next_norm; *sherman_base = next;
dstate_id_t next_sherman = next_norm;
for (const dstate_id_t &s : sherm) { for (const dstate_id_t &s : sherm) {
info.states[s].impl_id = next_sherman++; DEBUG_PRINTF("[sherm] mapping state %u to %u\n", s, next);
info.states[s].impl_id = next++;
}
*wide_limit = next;
for (const dstate_id_t &s : wideHead) {
DEBUG_PRINTF("[widehead] mapping state %u to %u\n", s, next);
info.states[s].impl_id = next++;
}
for (const dstate_id_t &s : wideState) {
DEBUG_PRINTF("[wide] mapping state %u to %u\n", s, next);
info.states[s].impl_id = next++;
} }
/* Check to see if we haven't over allocated our states */ /* Check to see if we haven't over allocated our states */
DEBUG_PRINTF("next sherman %u masked %u\n", next_sherman, DEBUG_PRINTF("next sherman %u masked %u\n", next,
(dstate_id_t)(next_sherman & STATE_MASK)); (dstate_id_t)(next & STATE_MASK));
return (next_sherman - 1) == ((next_sherman - 1) & STATE_MASK); return (next - 1) == ((next - 1) & STATE_MASK);
} }
static static
@ -470,12 +622,16 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
assert(alphaShift <= 8); assert(alphaShift <= 8);
u16 count_real_states; u16 count_real_states;
if (!allocateFSN16(info, &count_real_states)) { u16 wide_limit;
if (!allocateFSN16(info, &count_real_states, &wide_limit)) {
DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n", DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
info.size()); info.size());
return nullptr; return nullptr;
} }
DEBUG_PRINTF("count_real_states: %d\n", count_real_states);
DEBUG_PRINTF("non_wide_states: %d\n", wide_limit);
auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
map<dstate_id_t, AccelScheme> accel_escape_info map<dstate_id_t, AccelScheme> accel_escape_info
= info.strat.getAccelInfo(cc.grey); = info.strat.getAccelInfo(cc.grey);
@ -483,7 +639,7 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
size_t tran_size = (1 << info.getAlphaShift()) size_t tran_size = (1 << info.getAlphaShift())
* sizeof(u16) * count_real_states; * sizeof(u16) * count_real_states;
size_t aux_size = sizeof(mstate_aux) * info.size(); size_t aux_size = sizeof(mstate_aux) * wide_limit;
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size); size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcclellan) + tran_size);
size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
@ -491,12 +647,24 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
+ ri->getReportListSize(), 32); + ri->getReportListSize(), 32);
size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size); size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size);
size_t sherman_size = calcShermanRegionSize(info); size_t sherman_size = calcShermanRegionSize(info);
size_t wide_offset = ROUNDUP_16(sherman_offset + sherman_size);
size_t total_size = sherman_offset + sherman_size; size_t wide_size = calcWideRegionSize(info);
size_t total_size = wide_offset + wide_size;
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
DEBUG_PRINTF("aux_offset %zu\n", aux_offset);
DEBUG_PRINTF("aux_size %zu\n", aux_size);
DEBUG_PRINTF("rl size %u\n", ri->getReportListSize());
DEBUG_PRINTF("accel_offset %zu\n", accel_offset + sizeof(NFA));
DEBUG_PRINTF("accel_size %zu\n", accel_size);
DEBUG_PRINTF("sherman_offset %zu\n", sherman_offset);
DEBUG_PRINTF("sherman_size %zu\n", sherman_size);
DEBUG_PRINTF("wide_offset %zu\n", wide_offset);
DEBUG_PRINTF("wide_size %zu\n", wide_size);
DEBUG_PRINTF("total_size %zu\n", total_size);
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size); auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
char *nfa_base = (char *)nfa.get(); char *nfa_base = (char *)nfa.get();
@ -511,6 +679,9 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
mstate_aux *aux = (mstate_aux *)(nfa_base + aux_offset); mstate_aux *aux = (mstate_aux *)(nfa_base + aux_offset);
mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get()); mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get());
m->wide_limit = wide_limit;
m->wide_offset = wide_offset;
/* copy in the mc header information */ /* copy in the mc header information */
m->sherman_offset = sherman_offset; m->sherman_offset = sherman_offset;
m->sherman_end = total_size; m->sherman_end = total_size;
@ -518,7 +689,7 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
/* do normal states */ /* do normal states */
for (size_t i = 0; i < info.size(); i++) { for (size_t i = 0; i < info.size(); i++) {
if (info.is_sherman(i)) { if (info.is_sherman(i) || info.is_widestate(i)) {
continue; continue;
} }
@ -556,6 +727,7 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
mstate_aux *this_aux = getAux(nfa.get(), fs); mstate_aux *this_aux = getAux(nfa.get(), fs);
assert(fs >= count_real_states); assert(fs >= count_real_states);
assert(fs < wide_limit);
char *curr_sherman_entry char *curr_sherman_entry
= sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE; = sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE;
@ -599,6 +771,71 @@ bytecode_ptr<NFA> mcclellanCompile16(dfa_info &info, const CompileContext &cc,
} }
} }
if (!info.wide_state_chain.empty()) {
/* do wide states using info */
u16 wide_number = verify_u16(info.wide_symbol_chain.size());
char *wide_base = nfa_base + m->wide_offset;
assert(ISALIGNED_16(wide_base));
char *wide_top = wide_base;
*(u8 *)(wide_top++) = WIDE_STATE;
wide_top = ROUNDUP_PTR(wide_top, 2);
*(u16 *)(wide_top) = wide_number;
wide_top += 2;
char *curr_wide_entry = wide_top + wide_number * sizeof(u32);
u32 *wide_offset_list = (u32 *)wide_top;
/* get the order of writing wide states */
vector<size_t> order(wide_number);
for (size_t i = 0; i < wide_number; i++) {
dstate_id_t head = info.wide_state_chain[i].front();
size_t pos = info.implId(head) - m->wide_limit;
order[pos] = i;
}
for (size_t i : order) {
vector<dstate_id_t> &state_chain = info.wide_state_chain[i];
vector<symbol_t> &symbol_chain = info.wide_symbol_chain[i];
u16 width = verify_u16(symbol_chain.size());
*(u16 *)(curr_wide_entry + WIDE_WIDTH_OFFSET) = width;
u8 *chars = (u8 *)(curr_wide_entry + WIDE_SYMBOL_OFFSET16);
// store wide state symbol chain
for (size_t j = 0; j < width; j++) {
*(chars++) = verify_u8(symbol_chain[j]);
}
// store wide state transition table
u16 *trans = (u16 *)(curr_wide_entry
+ WIDE_TRANSITION_OFFSET16(width));
dstate_id_t tail = state_chain[width - 1];
symbol_t last = symbol_chain[width -1];
dstate_id_t tran = info.states[tail].next[last];
// 1. successful transition
*trans++ = info.implId(tran);
// 2. failure transition
for (size_t j = 0; verify_u16(j) < width - 1; j++) {
if (symbol_chain[j] != last) {
tran = info.states[state_chain[j]].next[last];
}
}
for (symbol_t sym = 0; sym < info.impl_alpha_size; sym++) {
if (sym != last) {
*trans++ = info.implId(info.states[tail].next[sym]);
}
else {
*trans++ = info.implId(tran);
}
}
*wide_offset_list++ = verify_u32(curr_wide_entry - wide_base);
curr_wide_entry = (char *)trans;
}
}
markEdges(nfa.get(), succ_table, info); markEdges(nfa.get(), succ_table, info);
if (accel_states && nfa) { if (accel_states && nfa) {
@ -844,12 +1081,16 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit,
if (trust_daddy_states) { if (trust_daddy_states) {
// Use the daddy already set for this state so long as it isn't already // Use the daddy already set for this state so long as it isn't already
// a Sherman state. // a Sherman state.
if (!info.is_sherman(currState.daddy)) { dstate_id_t daddy = currState.daddy;
if (!info.is_sherman(daddy) && !info.is_widestate(daddy)) {
hinted.insert(currState.daddy); hinted.insert(currState.daddy);
} else { } else {
// Fall back to granddaddy, which has already been processed (due // Fall back to granddaddy, which has already been processed (due
// to BFS ordering) and cannot be a Sherman state. // to BFS ordering) and cannot be a Sherman state.
dstate_id_t granddaddy = info.states[currState.daddy].daddy; dstate_id_t granddaddy = info.states[currState.daddy].daddy;
if (info.is_widestate(granddaddy)) {
return;
}
assert(!info.is_sherman(granddaddy)); assert(!info.is_sherman(granddaddy));
hinted.insert(granddaddy); hinted.insert(granddaddy);
} }
@ -861,7 +1102,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit,
assert(donor < curr_id); assert(donor < curr_id);
u32 score = 0; u32 score = 0;
if (info.is_sherman(donor)) { if (info.is_sherman(donor) || info.is_widestate(donor)) {
continue; continue;
} }
@ -934,6 +1175,290 @@ bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) {
return false; return false;
} }
/* \brief Test for only-one-predecessor property. */
static
bool check_property1(const DfaPrevInfo &info, const u16 impl_alpha_size,
const dstate_id_t curr_id, dstate_id_t &prev_id,
symbol_t &prev_sym) {
u32 num_prev = 0;
bool test_p1 = false;
for (symbol_t sym = 0; sym < impl_alpha_size; sym++) {
num_prev += info.states[curr_id].prev_vec[sym].size();
DEBUG_PRINTF("Check symbol: %u, with its vector size: %lu\n", sym,
info.states[curr_id].prev_vec[sym].size());
if (num_prev == 1 && !test_p1) {
test_p1 = true;
prev_id = info.states[curr_id].prev_vec[sym].front(); //[0] for sure???
prev_sym = sym;
}
}
return num_prev == 1;
}
/* \brief Test for same-failure-action property. */
static
bool check_property2(const raw_dfa &rdfa, const u16 impl_alpha_size,
const dstate_id_t curr_id, const dstate_id_t prev_id,
const symbol_t curr_sym, const symbol_t prev_sym) {
const dstate &prevState = rdfa.states[prev_id];
const dstate &currState = rdfa.states[curr_id];
// Compare transition tables between currState and prevState.
u16 score = 0;
for (symbol_t sym = 0; sym < impl_alpha_size; sym++) {
if (currState.next[sym] == prevState.next[sym]
&& sym != curr_sym && sym != prev_sym) {
score++;
}
}
DEBUG_PRINTF("(Score: %u/%u)\n", score, impl_alpha_size);
// 2 cases.
if (curr_sym != prev_sym && score >= impl_alpha_size - 2
&& currState.next[prev_sym] == prevState.next[curr_sym]) {
return true;
} else if (curr_sym == prev_sym && score == impl_alpha_size - 1) {
return true;
}
return false;
}
/* \brief Check whether adding current prev_id will generate a circle.*/
static
bool check_circle(const DfaPrevInfo &info, const u16 impl_alpha_size,
const vector<dstate_id_t> &chain, const dstate_id_t id) {
const vector<vector<dstate_id_t>> &prev_vec = info.states[id].prev_vec;
const dstate_id_t tail = chain.front();
for (symbol_t sym = 0; sym < impl_alpha_size; sym++) {
auto iter = find(prev_vec[sym].begin(), prev_vec[sym].end(), tail);
if (iter != prev_vec[sym].end()) {
// Tail is one of id's predecessors, forming a circle.
return true;
}
}
return false;
}
/* \brief Returns a chain of state ids and symbols. */
static
dstate_id_t find_chain_candidate(const raw_dfa &rdfa, const DfaPrevInfo &info,
const dstate_id_t curr_id,
const symbol_t curr_sym,
vector<dstate_id_t> &temp_chain) {
//Record current id first.
temp_chain.push_back(curr_id);
const u16 size = info.impl_alpha_size;
// Stop when entering root cloud.
if (rdfa.start_anchored != DEAD_STATE
&& is_cyclic_near(rdfa, rdfa.start_anchored)
&& curr_id < size) {
return curr_id;
}
if (rdfa.start_floating != DEAD_STATE
&& curr_id >= rdfa.start_floating
&& curr_id < rdfa.start_floating + size * 3) {
return curr_id;
}
// Stop when reaching anchored or floating.
if (curr_id == rdfa.start_anchored || curr_id == rdfa.start_floating) {
return curr_id;
}
dstate_id_t prev_id = 0;
symbol_t prev_sym = ALPHABET_SIZE;
// Check the only-one-predecessor property.
if (!check_property1(info, size, curr_id, prev_id, prev_sym)) {
return curr_id;
}
assert(prev_id != 0 && prev_sym != ALPHABET_SIZE);
DEBUG_PRINTF("(P1 test passed.)\n");
// Circle testing for the prev_id that passes the P1 test.
if (check_circle(info, size, temp_chain, prev_id)) {
DEBUG_PRINTF("(A circle is found.)\n");
return curr_id;
}
// Check the same-failure-action property.
if (!check_property2(rdfa, size, curr_id, prev_id, curr_sym, prev_sym)) {
return curr_id;
}
DEBUG_PRINTF("(P2 test passed.)\n");
if (!rdfa.states[prev_id].reports.empty()
|| !rdfa.states[prev_id].reports_eod.empty()) {
return curr_id;
} else {
return find_chain_candidate(rdfa, info, prev_id, prev_sym, temp_chain);
}
}
/* \brief Always store the non-extensible chains found till now. */
static
bool store_chain_longest(vector<vector<dstate_id_t>> &candidate_chain,
vector<dstate_id_t> &temp_chain,
dynamic_bitset<> &added, bool head_is_new) {
dstate_id_t head = temp_chain.front();
u16 length = temp_chain.size();
if (head_is_new) {
DEBUG_PRINTF("This is a new chain!\n");
// Add this new chain and get it marked.
candidate_chain.push_back(temp_chain);
for (auto &id : temp_chain) {
DEBUG_PRINTF("(Marking s%u ...)\n", id);
added.set(id);
}
return true;
}
DEBUG_PRINTF("This is a longer chain!\n");
assert(!candidate_chain.empty());
auto chain = find_if(candidate_chain.begin(), candidate_chain.end(),
[&](const vector<dstate_id_t> &it) {
return it.front() == head;
});
// Not a valid head, just do nothing and return.
if (chain == candidate_chain.end()) {
return false;
}
u16 len = chain->size();
if (length > len) {
// Find out the branch node first.
size_t piv = 0;
for (; piv < length; piv++) {
if ((*chain)[piv] != temp_chain[piv]) {
break;
}
}
for (size_t j = piv + 1; j < length; j++) {
DEBUG_PRINTF("(Marking s%u (new branch) ...)\n", temp_chain[j]);
added.set(temp_chain[j]);
}
// Unmark old unuseful nodes.
// (Except the tail node, which is in working queue)
for (size_t j = piv + 1; j < verify_u16(len - 1); j++) {
DEBUG_PRINTF("(UnMarking s%u (old branch)...)\n", (*chain)[j]);
added.reset((*chain)[j]);
}
chain->assign(temp_chain.begin(), temp_chain.end());
}
return false;
}
/* \brief Generate wide_symbol_chain from wide_state_chain. */
static
void generate_symbol_chain(dfa_info &info, vector<symbol_t> &chain_tail) {
raw_dfa &rdfa = info.raw;
assert(chain_tail.size() == info.wide_state_chain.size());
for (size_t i = 0; i < info.wide_state_chain.size(); i++) {
vector<dstate_id_t> &state_chain = info.wide_state_chain[i];
vector<symbol_t> symbol_chain;
info.extra[state_chain[0]].wideHead = true;
size_t width = state_chain.size() - 1;
for (size_t j = 0; j < width; j++) {
dstate_id_t curr_id = state_chain[j];
dstate_id_t next_id = state_chain[j + 1];
// The last state of the chain doesn't belong to a wide state.
info.extra[curr_id].wideState = true;
// The tail symbol comes from vector chain_tail;
if (j == width - 1) {
symbol_chain.push_back(chain_tail[i]);
} else {
for (symbol_t sym = 0; sym < info.impl_alpha_size; sym++) {
if (rdfa.states[curr_id].next[sym] == next_id) {
symbol_chain.push_back(sym);
break;
}
}
}
}
info.wide_symbol_chain.push_back(symbol_chain);
}
}
/** \brief Find potential regions of states to be packed into wide states.
 *
 * Works backwards from the accept states: a BFS over the predecessor info in
 * \p dinfo grows single-predecessor runs into candidate chains, keeping only
 * chains of at least 8 substates. The longest chain per head is retained via
 * store_chain_longest(), and the matching symbol chains are generated at the
 * end.
 */
static
void find_wide_state(dfa_info &info) {
    DfaPrevInfo dinfo(info.raw);
    queue<dstate_id_t> pending;
    dynamic_bitset<> marked(info.raw.states.size());

    // Seed the worklist with all accept states.
    for (auto acc : dinfo.accepts) {
        pending.push(acc);
        marked.set(acc);
    }

    vector<symbol_t> chain_tail;
    while (!pending.empty()) {
        dstate_id_t state = pending.front();
        pending.pop();
        DEBUG_PRINTF("Newly popped state: s%u\n", state);

        for (symbol_t sym = 0; sym < dinfo.impl_alpha_size; sym++) {
            for (auto prev : dinfo.states[state].prev_vec[sym]) {
                if (marked.test(prev)) {
                    DEBUG_PRINTF("(s%u already marked.)\n", prev);
                    continue;
                }

                vector<dstate_id_t> candidate;
                // Head is a state failing the test of the chain.
                dstate_id_t head = find_chain_candidate(info.raw, dinfo,
                                                        prev, sym,
                                                        candidate);

                // A candidate chain should contain 8 substates at least.
                if (candidate.size() < 8) {
                    DEBUG_PRINTF("(Not enough substates, continue.)\n");
                    continue;
                }

                bool fresh_head = !marked.test(head);
                if (fresh_head) {
                    marked.set(head);
                    pending.push(head);
                    DEBUG_PRINTF("Newly pushed state: s%u\n", head);
                }

                // Chain was built backwards; flip it and append the state we
                // popped so it ends at this worklist state.
                reverse(candidate.begin(), candidate.end());
                candidate.push_back(state);
                assert(head > 0 && head == candidate.front());

                // Only keep the tail symbol when this chain was stored.
                if (store_chain_longest(info.wide_state_chain, candidate,
                                        marked, fresh_head)) {
                    chain_tail.push_back(sym);
                }
            }
        }
    }

    generate_symbol_chain(info, chain_tail);
}
bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
const CompileContext &cc, const CompileContext &cc,
bool trust_daddy_states, bool trust_daddy_states,
@ -952,11 +1477,19 @@ bytecode_ptr<NFA> mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
bytecode_ptr<NFA> nfa; bytecode_ptr<NFA> nfa;
if (!using8bit) { if (!using8bit) {
if (cc.grey.allowWideStates && strat.getType() == McClellan
&& !is_triggered(raw.kind)) {
find_wide_state(info);
}
u16 total_daddy = 0; u16 total_daddy = 0;
bool any_cyclic_near_anchored_state bool any_cyclic_near_anchored_state
= is_cyclic_near(raw, raw.start_anchored); = is_cyclic_near(raw, raw.start_anchored);
for (u32 i = 0; i < info.size(); i++) { for (u32 i = 0; i < info.size(); i++) {
if (info.is_widestate(i)) {
continue;
}
find_better_daddy(info, i, using8bit, find_better_daddy(info, i, using8bit,
any_cyclic_near_anchored_state, any_cyclic_near_anchored_state,
trust_daddy_states, cc.grey); trust_daddy_states, cc.grey);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -60,6 +60,7 @@ public:
u32 max_allowed_offset_accel() const override; u32 max_allowed_offset_accel() const override;
u32 max_stop_char() const override; u32 max_stop_char() const override;
u32 max_floating_stop_char() const override; u32 max_floating_stop_char() const override;
DfaType getType() const override { return McClellan; }
private: private:
raw_dfa &rdfa; raw_dfa &rdfa;

View File

@ -275,7 +275,8 @@ void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f) {
dumpDotPreambleDfa(f); dumpDotPreambleDfa(f);
for (u16 i = 1; i < m->state_count; i++) { u16 sherman_ceil = m->has_wide == 1 ? m->wide_limit : m->state_count;
for (u16 i = 1; i < sherman_ceil; i++) {
describeNode(nfa, m, i, f); describeNode(nfa, m, i, f);
u16 t[ALPHABET_SIZE]; u16 t[ALPHABET_SIZE];
@ -314,7 +315,8 @@ void dumpAccelMasks(FILE *f, const mcclellan *m, const mstate_aux *aux) {
fprintf(f, "Acceleration\n"); fprintf(f, "Acceleration\n");
fprintf(f, "------------\n"); fprintf(f, "------------\n");
for (u16 i = 0; i < m->state_count; i++) { u16 sherman_ceil = m->has_wide == 1 ? m->wide_limit : m->state_count;
for (u16 i = 0; i < sherman_ceil; i++) {
if (!aux[i].accel_offset) { if (!aux[i].accel_offset) {
continue; continue;
} }
@ -360,7 +362,8 @@ void dumpCommonHeader(FILE *f, const mcclellan *m) {
static static
void dumpTransitions(FILE *f, const NFA *nfa, const mcclellan *m, void dumpTransitions(FILE *f, const NFA *nfa, const mcclellan *m,
const mstate_aux *aux) { const mstate_aux *aux) {
for (u16 i = 0; i < m->state_count; i++) { u16 sherman_ceil = m->has_wide == 1 ? m->wide_limit : m->state_count;
for (u16 i = 0; i < sherman_ceil; i++) {
fprintf(f, "%05hu", i); fprintf(f, "%05hu", i);
if (aux[i].accel_offset) { if (aux[i].accel_offset) {
dumpAccelText(f, (const union AccelAux *)((const char *)m + dumpAccelText(f, (const union AccelAux *)((const char *)m +

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -61,6 +61,7 @@ public:
u32 max_allowed_offset_accel() const override; u32 max_allowed_offset_accel() const override;
u32 max_stop_char() const override; u32 max_stop_char() const override;
u32 max_floating_stop_char() const override; u32 max_floating_stop_char() const override;
DfaType getType() const override { return Sheng; }
private: private:
raw_dfa &rdfa; raw_dfa &rdfa;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2018, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -185,6 +185,7 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) {
return false; return false;
} }
vis.lit.set_pure();
const ue2_literal &lit = vis.lit; const ue2_literal &lit = vis.lit;
if (lit.empty()) { if (lit.empty()) {

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2018, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -238,7 +238,11 @@ hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t,
assert(id && id < t->size); // id is an offset into bytecode assert(id && id < t->size); // id is an offset into bytecode
const u64a som = 0; const u64a som = 0;
const u8 flags = 0; const u8 flags = 0;
return roseRunProgram_i(t, scratch, id, som, end, flags); if (!scratch->pure) {
return roseRunProgram(t, scratch, id, som, end, flags);
} else {
return roseRunProgram_l(t, scratch, id, som, end, flags);
}
} }
static rose_inline static rose_inline

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2018, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -2843,9 +2843,34 @@ vector<LitFragment> groupByFragment(const RoseBuildImpl &build) {
DEBUG_PRINTF("fragment candidate: lit_id=%u %s\n", lit_id, DEBUG_PRINTF("fragment candidate: lit_id=%u %s\n", lit_id,
dumpString(lit.s).c_str()); dumpString(lit.s).c_str());
auto &fi = frag_info[getFragment(lit)];
fi.lit_ids.push_back(lit_id); /** 0:/xxabcdefgh/ */
fi.groups |= groups; /** 1:/yyabcdefgh/ */
/** 2:/yyabcdefgh.+/ */
// Above 3 patterns should firstly convert into RoseLiteralMap with
// 2 elements ("xxabcdefgh" and "yyabcdefgh"), then convert into
// LitFragment with 1 element ("abcdefgh"). Special care should be
// taken to handle the 'pure' flag during the conversion.
rose_literal_id lit_frag = getFragment(lit);
auto it = frag_info.find(lit_frag);
if (it != frag_info.end()) {
if (!lit_frag.s.get_pure() && it->first.s.get_pure()) {
struct FragmentInfo f_info = it->second;
f_info.lit_ids.push_back(lit_id);
f_info.groups |= groups;
frag_info.erase(it->first);
frag_info.emplace(lit_frag, f_info);
} else {
it->second.lit_ids.push_back(lit_id);
it->second.groups |= groups;
}
} else {
struct FragmentInfo f_info;
f_info.lit_ids.push_back(lit_id);
f_info.groups |= groups;
frag_info.emplace(lit_frag, f_info);
}
} }
for (auto &m : frag_info) { for (auto &m : frag_info) {

View File

@ -115,9 +115,9 @@ class RoseGraphWriter {
public: public:
RoseGraphWriter(const RoseBuildImpl &b_in, const map<u32, u32> &frag_map_in, RoseGraphWriter(const RoseBuildImpl &b_in, const map<u32, u32> &frag_map_in,
const map<left_id, u32> &lqm_in, const map<left_id, u32> &lqm_in,
const map<suffix_id, u32> &sqm_in, const RoseEngine *t_in) const map<suffix_id, u32> &sqm_in)
: frag_map(frag_map_in), leftfix_queue_map(lqm_in), : frag_map(frag_map_in), leftfix_queue_map(lqm_in),
suffix_queue_map(sqm_in), build(b_in), t(t_in) { suffix_queue_map(sqm_in), build(b_in) {
for (const auto &m : build.ghost) { for (const auto &m : build.ghost) {
ghost.insert(m.second); ghost.insert(m.second);
} }
@ -273,7 +273,6 @@ private:
const map<left_id, u32> &leftfix_queue_map; const map<left_id, u32> &leftfix_queue_map;
const map<suffix_id, u32> &suffix_queue_map; const map<suffix_id, u32> &suffix_queue_map;
const RoseBuildImpl &build; const RoseBuildImpl &build;
const RoseEngine *t;
}; };
} // namespace } // namespace
@ -313,8 +312,7 @@ void dumpRoseGraph(const RoseBuildImpl &build, const RoseEngine *t,
ofstream os(ss.str()); ofstream os(ss.str());
auto frag_map = makeFragMap(fragments); auto frag_map = makeFragMap(fragments);
RoseGraphWriter writer(build, frag_map, leftfix_queue_map, suffix_queue_map, RoseGraphWriter writer(build, frag_map, leftfix_queue_map, suffix_queue_map);
t);
writeGraphviz(os, build.g, writer, get(boost::vertex_index, build.g)); writeGraphviz(os, build.g, writer, get(boost::vertex_index, build.g));
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -340,7 +340,14 @@ public:
std::pair<u32, bool> insert(const rose_literal_id &lit) { std::pair<u32, bool> insert(const rose_literal_id &lit) {
auto it = lits_index.find(lit); auto it = lits_index.find(lit);
if (it != lits_index.end()) { if (it != lits_index.end()) {
return {it->second, false}; u32 idx = it->second;
auto &l = lits.at(idx);
if (!lit.s.get_pure() && l.s.get_pure()) {
lits_index.erase(l);
l.s.unset_pure();
lits_index.emplace(l, idx);
}
return {idx, false};
} }
u32 id = verify_u32(lits.size()); u32 id = verify_u32(lits.size());
lits.push_back(lit); lits.push_back(lit);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -727,6 +727,7 @@ void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp,
const auto &s_final = lit_final.get_string(); const auto &s_final = lit_final.get_string();
bool nocase = lit_final.any_nocase(); bool nocase = lit_final.any_nocase();
bool pure = f.s.get_pure();
DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n", DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, cmp=%s\n",
f.fragment_id, escapeString(s_final).c_str(), (int)nocase, f.fragment_id, escapeString(s_final).c_str(), (int)nocase,
@ -740,7 +741,7 @@ void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp,
const auto &groups = f.groups; const auto &groups = f.groups;
mp.lits.emplace_back(move(s_final), nocase, noruns, f.fragment_id, mp.lits.emplace_back(move(s_final), nocase, noruns, f.fragment_id,
groups, msk, cmp); groups, msk, cmp, pure);
} }
static static

View File

@ -33,6 +33,7 @@
#include "rose_common.h" #include "rose_common.h"
#include "rose_internal.h" #include "rose_internal.h"
#include "stream_long_lit_hash.h" #include "stream_long_lit_hash.h"
#include "util/compare.h"
#include "util/copybytes.h" #include "util/copybytes.h"
static really_inline static really_inline

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2018, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -141,6 +141,7 @@ void populateCoreInfo(struct hs_scratch *s, const struct RoseEngine *rose,
s->deduper.current_report_offset = ~0ULL; s->deduper.current_report_offset = ~0ULL;
s->deduper.som_log_dirty = 1; /* som logs have not been cleared */ s->deduper.som_log_dirty = 1; /* som logs have not been cleared */
s->fdr_conf = NULL; s->fdr_conf = NULL;
s->pure = 0;
// Rose program execution (used for some report paths) depends on these // Rose program execution (used for some report paths) depends on these
// values being initialised. // values being initialised.
@ -445,6 +446,7 @@ done_scan:
scratch); scratch);
} }
set_retval:
if (rose->flushCombProgramOffset) { if (rose->flushCombProgramOffset) {
if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) { if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) {
unmarkScratchInUse(scratch); unmarkScratchInUse(scratch);
@ -452,7 +454,6 @@ done_scan:
} }
} }
set_retval:
DEBUG_PRINTF("done. told_to_stop_matching=%d\n", DEBUG_PRINTF("done. told_to_stop_matching=%d\n",
told_to_stop_matching(scratch)); told_to_stop_matching(scratch));
hs_error_t rv = told_to_stop_matching(scratch) ? HS_SCAN_TERMINATED hs_error_t rv = told_to_stop_matching(scratch) ? HS_SCAN_TERMINATED
@ -934,12 +935,6 @@ hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data,
} }
} }
if (rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) {
if (roseRunFlushCombProgram(rose, scratch, ~0ULL) == MO_HALT_MATCHING) {
scratch->core_info.status |= STATUS_TERMINATED;
}
}
setStreamStatus(state, scratch->core_info.status); setStreamStatus(state, scratch->core_info.status);
if (likely(!can_stop_matching(scratch))) { if (likely(!can_stop_matching(scratch))) {
@ -994,6 +989,13 @@ hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
unmarkScratchInUse(scratch); unmarkScratchInUse(scratch);
} }
if (id->rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) {
if (roseRunFlushCombProgram(id->rose, scratch, ~0ULL)
== MO_HALT_MATCHING) {
scratch->core_info.status |= STATUS_TERMINATED;
}
}
hs_stream_free(id); hs_stream_free(id);
return HS_SUCCESS; return HS_SUCCESS;
@ -1019,6 +1021,13 @@ hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags,
unmarkScratchInUse(scratch); unmarkScratchInUse(scratch);
} }
if (id->rose->flushCombProgramOffset && !told_to_stop_matching(scratch)) {
if (roseRunFlushCombProgram(id->rose, scratch, ~0ULL)
== MO_HALT_MATCHING) {
scratch->core_info.status |= STATUS_TERMINATED;
}
}
// history already initialised // history already initialised
init_stream(id, id->rose, 0); init_stream(id, id->rose, 0);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -137,6 +137,7 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) {
s->scratchSize = alloc_size; s->scratchSize = alloc_size;
s->scratch_alloc = (char *)s_tmp; s->scratch_alloc = (char *)s_tmp;
s->fdr_conf = NULL; s->fdr_conf = NULL;
s->pure = 0;
// each of these is at an offset from the previous // each of these is at an offset from the previous
char *current = (char *)s + sizeof(*s); char *current = (char *)s + sizeof(*s);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2018, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -208,6 +208,7 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
u64a *fdr_conf; /**< FDR confirm value */ u64a *fdr_conf; /**< FDR confirm value */
u8 fdr_conf_offset; /**< offset where FDR/Teddy front end matches u8 fdr_conf_offset; /**< offset where FDR/Teddy front end matches
* in buffer */ * in buffer */
u8 pure; /**< indicator of pure-literal or cutting-literal */
}; };
/* array of fatbit ptr; TODO: why not an array of fatbits? */ /* array of fatbit ptr; TODO: why not an array of fatbits? */

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2019, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -211,10 +211,17 @@ public:
size_t hash() const; size_t hash() const;
void set_pure() { pure = true; }
void unset_pure() { pure = false; }
bool get_pure() const { return pure; }
/* TODO: consider existing member functions possibly related with pure. */
private: private:
friend const_iterator; friend const_iterator;
std::string s; std::string s;
boost::dynamic_bitset<> nocase; boost::dynamic_bitset<> nocase;
bool pure = false; /**< born from cutting or not (pure literal). */
}; };
/// Return a reversed copy of this literal. /// Return a reversed copy of this literal.

View File

@ -1065,6 +1065,9 @@ int HS_CDECL main(int argc, char *argv[]) {
} catch (const SqlFailure &f) { } catch (const SqlFailure &f) {
cerr << f.message << '\n'; cerr << f.message << '\n';
return -1; return -1;
} catch (const std::runtime_error &e) {
cerr << "Internal error: " << e.what() << '\n';
return -1;
} }
return 0; return 0;

View File

@ -64,7 +64,6 @@ set_source_files_properties(${hscollider_SOURCES} PROPERTIES
INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR})
add_executable(hscollider ${hscollider_SOURCES}) add_executable(hscollider ${hscollider_SOURCES})
add_dependencies(hscollider ragel_ColliderCorporaParser) add_dependencies(hscollider ragel_ColliderCorporaParser)
add_dependencies(hscollider pcre)
if(NOT WIN32) if(NOT WIN32)
if (BUILD_CHIMERA) if (BUILD_CHIMERA)

View File

@ -61,6 +61,7 @@
#include <cstdlib> #include <cstdlib>
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <iterator>
#include <mutex> #include <mutex>
#include <queue> #include <queue>
#include <string> #include <string>

View File

@ -477,6 +477,7 @@ TEST(HybridArgChecks, AllocScratchBogusScratch) {
makeDatabase(&db); makeDatabase(&db);
ch_scratch_t *blah = (ch_scratch_t *)malloc(100); ch_scratch_t *blah = (ch_scratch_t *)malloc(100);
ASSERT_TRUE(blah != nullptr);
memset(blah, 0xf0, 100); memset(blah, 0xf0, 100);
ch_error_t err = ch_alloc_scratch(db, &blah); ch_error_t err = ch_alloc_scratch(db, &blah);
ASSERT_EQ(CH_INVALID, err); ASSERT_EQ(CH_INVALID, err);
@ -536,6 +537,7 @@ TEST(HybridArgChecks, DatabaseSizeNoDatabase) {
TEST(HybridArgChecks, CloneBadScratch) { TEST(HybridArgChecks, CloneBadScratch) {
// Try cloning the scratch // Try cloning the scratch
void *local_garbage = malloc(sizeof(garbage)); void *local_garbage = malloc(sizeof(garbage));
ASSERT_TRUE(local_garbage != nullptr);
memcpy(local_garbage, garbage, sizeof(garbage)); memcpy(local_garbage, garbage, sizeof(garbage));
ch_scratch_t *cloned = nullptr; ch_scratch_t *cloned = nullptr;
ch_scratch_t *scratch = (ch_scratch_t *)local_garbage; ch_scratch_t *scratch = (ch_scratch_t *)local_garbage;
@ -550,6 +552,7 @@ TEST(HybridArgChecks, ScanBadScratch) {
makeDatabase(&db); makeDatabase(&db);
void *local_garbage = malloc(sizeof(garbage)); void *local_garbage = malloc(sizeof(garbage));
ASSERT_TRUE(local_garbage != nullptr);
memcpy(local_garbage, garbage, sizeof(garbage)); memcpy(local_garbage, garbage, sizeof(garbage));
ch_scratch_t *scratch = (ch_scratch_t *)local_garbage; ch_scratch_t *scratch = (ch_scratch_t *)local_garbage;

View File

@ -1370,6 +1370,7 @@ TEST(HyperscanArgChecks, AllocScratchBogusScratch) {
ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(HS_SUCCESS, err);
ASSERT_TRUE(db != nullptr); ASSERT_TRUE(db != nullptr);
hs_scratch_t *blah = (hs_scratch_t *)malloc(100); hs_scratch_t *blah = (hs_scratch_t *)malloc(100);
ASSERT_TRUE(blah != nullptr);
memset(blah, 0xf0, 100); memset(blah, 0xf0, 100);
err = hs_alloc_scratch(db, &blah); err = hs_alloc_scratch(db, &blah);
ASSERT_EQ(HS_INVALID, err); ASSERT_EQ(HS_INVALID, err);
@ -2034,6 +2035,7 @@ TEST(HyperscanArgChecks, ScratchSizeBadScratch) {
TEST(HyperscanArgChecks, CloneBadScratch) { TEST(HyperscanArgChecks, CloneBadScratch) {
// Try cloning the scratch // Try cloning the scratch
void *local_garbage = malloc(sizeof(garbage)); void *local_garbage = malloc(sizeof(garbage));
ASSERT_TRUE(local_garbage != nullptr);
memcpy(local_garbage, garbage, sizeof(garbage)); memcpy(local_garbage, garbage, sizeof(garbage));
hs_scratch_t *cloned = nullptr; hs_scratch_t *cloned = nullptr;
hs_scratch_t *scratch = (hs_scratch_t *)local_garbage; hs_scratch_t *scratch = (hs_scratch_t *)local_garbage;
@ -2052,6 +2054,7 @@ TEST(HyperscanArgChecks, ScanBadScratch) {
ASSERT_TRUE(db != nullptr); ASSERT_TRUE(db != nullptr);
void *local_garbage = malloc(sizeof(garbage)); void *local_garbage = malloc(sizeof(garbage));
ASSERT_TRUE(local_garbage != nullptr);
memcpy(local_garbage, garbage, sizeof(garbage)); memcpy(local_garbage, garbage, sizeof(garbage));
hs_scratch_t *scratch = (hs_scratch_t *)local_garbage; hs_scratch_t *scratch = (hs_scratch_t *)local_garbage;
@ -2072,6 +2075,7 @@ TEST(HyperscanArgChecks, ScanStreamBadScratch) {
ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(HS_SUCCESS, err);
ASSERT_TRUE(db != nullptr); ASSERT_TRUE(db != nullptr);
void *local_garbage = malloc(sizeof(garbage)); void *local_garbage = malloc(sizeof(garbage));
ASSERT_TRUE(local_garbage != nullptr);
memcpy(local_garbage, garbage, sizeof(garbage)); memcpy(local_garbage, garbage, sizeof(garbage));
hs_scratch_t *scratch = (hs_scratch_t *)local_garbage; hs_scratch_t *scratch = (hs_scratch_t *)local_garbage;
@ -2107,6 +2111,7 @@ TEST(HyperscanArgChecks, ResetStreamBadScratch) {
ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(HS_SUCCESS, err);
ASSERT_TRUE(db != nullptr); ASSERT_TRUE(db != nullptr);
void *local_garbage = malloc(sizeof(garbage)); void *local_garbage = malloc(sizeof(garbage));
ASSERT_TRUE(local_garbage != nullptr);
memcpy(local_garbage, garbage, sizeof(garbage)); memcpy(local_garbage, garbage, sizeof(garbage));
hs_scratch_t *scratch = (hs_scratch_t *)local_garbage; hs_scratch_t *scratch = (hs_scratch_t *)local_garbage;
@ -2142,6 +2147,7 @@ TEST(HyperscanArgChecks, ScanVectorBadScratch) {
ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(HS_SUCCESS, err);
ASSERT_TRUE(db != nullptr); ASSERT_TRUE(db != nullptr);
void *local_garbage = malloc(sizeof(garbage)); void *local_garbage = malloc(sizeof(garbage));
ASSERT_TRUE(local_garbage != nullptr);
memcpy(local_garbage, garbage, sizeof(garbage)); memcpy(local_garbage, garbage, sizeof(garbage));
hs_scratch_t *scratch = (hs_scratch_t *)local_garbage; hs_scratch_t *scratch = (hs_scratch_t *)local_garbage;