From 1875d55cf1d047da0a74b2df8311225ace604dcd Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 12 Jan 2017 12:35:54 +1100 Subject: [PATCH] parser: add initial parser for control verbs This more reliably handles control verbs like (*UTF8) that can only happen at the start of the pattern, and allows them in any ordering. --- CMakeLists.txt | 9 +++ src/parser/Parser.h | 4 +- src/parser/Parser.rl | 43 ++++++------ src/parser/control_verbs.h | 46 ++++++++++++ src/parser/control_verbs.rl | 121 ++++++++++++++++++++++++++++++++ unit/hyperscan/bad_patterns.txt | 9 ++- 6 files changed, 208 insertions(+), 24 deletions(-) create mode 100644 src/parser/control_verbs.h create mode 100644 src/parser/control_verbs.rl diff --git a/CMakeLists.txt b/CMakeLists.txt index 27d3e02b..4ec1f9e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -459,6 +459,13 @@ set_source_files_properties( ragelmaker(src/parser/Parser.rl) +set_source_files_properties( + ${CMAKE_BINARY_DIR}/src/parser/control_verbs.cpp + PROPERTIES + COMPILE_FLAGS "${RAGEL_C_FLAGS}") + +ragelmaker(src/parser/control_verbs.rl) + SET(hs_HEADERS src/hs.h src/hs_common.h @@ -891,6 +898,8 @@ SET (hs_SRCS src/parser/buildstate.h src/parser/check_refs.cpp src/parser/check_refs.h + src/parser/control_verbs.cpp + src/parser/control_verbs.h src/parser/parse_error.cpp src/parser/parse_error.h src/parser/parser_util.cpp diff --git a/src/parser/Parser.h b/src/parser/Parser.h index 45c3ac7a..a034a18f 100644 --- a/src/parser/Parser.h +++ b/src/parser/Parser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -69,7 +69,7 @@ struct ParseMode { * * This call will throw a ParseError on failure. 
*/ -std::unique_ptr<Component> parse(const char *const ptr, ParseMode &mode); +std::unique_ptr<Component> parse(const char *ptr, ParseMode &mode); } // namespace ue2 diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 53130ddf..dfa0beda 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ /* Parser.cpp is a built source, may not be in same dir as parser files */ #include "parser/check_refs.h" +#include "parser/control_verbs.h" #include "parser/ComponentAlternation.h" #include "parser/ComponentAssertion.h" #include "parser/ComponentAtomicGroup.h" @@ -549,27 +550,23 @@ unichar readUtf8CodePoint4c(const u8 *ts) { ############################################################# readVerb := |* 'UTF8)' => { - if (ts != ptr + 2) { - throw LocatedParseError("(*UTF8) must be at start of " - "expression, encountered"); - } - mode.utf8 = true; - globalMode.utf8 = true; /* once you unicode, you can't stop */ - ucp_start_p = te; /* (*UCP) can appear after us */ - fret; + throw LocatedParseError("(*UTF8) must be at start of " + "expression, encountered"); + }; + 'UTF)' => { + throw LocatedParseError("(*UTF) must be at start of " + "expression, encountered"); }; 'UCP)' => { - if (ts != ucp_start_p + 2) { - throw LocatedParseError("(*UCP) must be at start of " - "expression, encountered"); - } - mode.ucp = true; - globalMode.ucp = true; /* once you unicode, you can't stop */ - fret; + throw LocatedParseError("(*UCP) must be at start of " + "expression, encountered"); }; 'UTF16)' => { throw LocatedParseError("(*UTF16) not supported"); }; + 'UTF32)' => { + throw LocatedParseError("(*UTF32) not supported"); + }; any => { throw LocatedParseError("Unknown control verb"); }; @@ -1834,10 +1831,18 @@ unichar
readUtf8CodePoint4c(const u8 *ts) { %% write data nofinal; /** \brief Main parser call, returns root Component or nullptr. */ -unique_ptr<Component> parse(const char *const c_ptr, ParseMode &globalMode) { - const u8 * const ptr = (const u8 * const)c_ptr; +unique_ptr<Component> parse(const char *c_ptr, ParseMode &globalMode) { + assert(c_ptr); + + const u8 *ptr = (const u8 *const)c_ptr; const u8 *p = ptr; const u8 *pe = ptr + strlen(c_ptr); + + // First, read the control verbs, set any global mode flags and move the + // ptr forward. + p = (const u8 *)read_control_verbs((const char *)p, (const char *)pe, + globalMode); + const u8 *eof = pe; int cs; UNUSED int act; @@ -1891,8 +1896,6 @@ unique_ptr<Component> parse(const char *const c_ptr, ParseMode &globalMode) { // Location at which the current character class began. const u8 *currentClsBegin = p; - const u8 *ucp_start_p = p; /* for (*UCP) verb */ - // We throw exceptions on various parsing failures beyond this point: we // use a try/catch block here to clean up our allocated memory before we // re-throw the exception to the caller. diff --git a/src/parser/control_verbs.h b/src/parser/control_verbs.h new file mode 100644 index 00000000..9cf5b116 --- /dev/null +++ b/src/parser/control_verbs.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Parser for control verbs that can occur at the beginning of a pattern. + */ + +#ifndef CONTROL_VERBS_H +#define CONTROL_VERBS_H + +namespace ue2 { + +struct ParseMode; + +const char *read_control_verbs(const char *ptr, const char *end, + ParseMode &mode); + +} // namespace ue2 + +#endif // CONTROL_VERBS_H diff --git a/src/parser/control_verbs.rl b/src/parser/control_verbs.rl new file mode 100644 index 00000000..7eb9b86c --- /dev/null +++ b/src/parser/control_verbs.rl @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Parser for control verbs that can occur at the beginning of a pattern. + */ + +#include "parser/control_verbs.h" + +#include "parser/Parser.h" +#include "parser/parse_error.h" + +#include <sstream> +#include <string> + +using namespace std; + +namespace ue2 { + +const char *read_control_verbs(const char *ptr, const char *end, + ParseMode &mode) { + const char *p = ptr; + const char *pe = end; + const char *eof = pe; + const char *ts, *te; + int cs; + UNUSED int act; + + %%{ + machine ControlVerbs; + + # Verbs that we recognise but do not support.
+ unhandledVerbs = '(*' ( + 'LIMIT_MATCH=' [0-9]+ | + 'LIMIT_RECURSION=' [0-9]+ | + 'NO_AUTO_POSSESS' | + 'NO_START_OPT' | + 'UTF16' | + 'UTF32' | + 'CR' | + 'LF' | + 'CRLF' | + 'ANYCRLF' | + 'ANY' | + 'BSR_ANYCRLF' | + 'BSR_UNICODE' + ) . ')'; + + main := |* + '(*UTF8)' | '(*UTF)' => { + mode.utf8 = true; + }; + + '(*UCP)' => { + mode.ucp = true; + }; + + unhandledVerbs => { + ostringstream str; + str << "Unsupported control verb " << string(ts, te - ts); + throw LocatedParseError(str.str()); + }; + + '(*' [^)]+ ')' => { + ostringstream str; + str << "Unknown control verb " << string(ts, te - ts); + throw LocatedParseError(str.str()); + }; + + # Anything else means we're done. + any => { + fhold; + fbreak; + }; + *|; + + write data; + write init; + }%% + + try { + %% write exec; + } catch (LocatedParseError &error) { + if (ts >= ptr && ts <= pe) { + error.locate(ts - ptr); + } else { + error.locate(0); + } + throw; + } + + return p; +} + +} // namespace ue2 diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 37307bc9..52287ec0 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -90,8 +90,8 @@ 91:/a\owibble/ #Value in \o{...} sequence is non-octal or missing braces at index 1. 92:/a\o{wibble/ #Value in \o{...} sequence is non-octal or missing braces at index 1. 93:/a\o{777}/ #Value in \o{...} sequence is too large at index 1. -94:/(*UTF16)foo/ #(*UTF16) not supported at index 2. -95:/(*BSR_UNICODE)abc/ #Unknown control verb at index 2. +94:/(*UTF16)foo/ #Unsupported control verb (*UTF16) at index 0. +95:/(*BSR_UNICODE)abc/ #Unsupported control verb (*BSR_UNICODE) at index 0. 96:/a+(*SKIP)b/ #Unknown control verb at index 4. 97:/foo(*/ #Invalid repeat at index 4. 98:/[:\]:]/ #POSIX named classes are only supported inside a class at index 0. @@ -130,3 +130,8 @@ 133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2. 134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2. 
135:/[^\D\d]/8W #Pattern can never match. +136:/(*LIMIT_MATCH=1000)foobar/ #Unsupported control verb (*LIMIT_MATCH=1000) at index 0. +137:/(*UTF32)foobar/ #Unsupported control verb (*UTF32) at index 0. +138:/(*UNKNOWNVERB)foobar/ #Unknown control verb (*UNKNOWNVERB) at index 0. +139:/foo(*UTF8)bar/ #(*UTF8) must be at start of expression, encountered at index 5. +140:/(?i)(*UTF8)foobar/ #(*UTF8) must be at start of expression, encountered at index 6.