mirror of
https://github.com/openappsec/openappsec.git
synced 2025-06-28 16:41:02 +03:00
654 lines
24 KiB
C++
Executable File
654 lines
24 KiB
C++
Executable File
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// #define WAF2_LOGGING_ENABLE
|
|
|
|
#include "Waf2Regex.h"
|
|
#include "debug.h"
|
|
#include <vector>
|
|
#include <algorithm>
|
|
|
|
USE_DEBUG_FLAG(D_WAAP_REGEX);
|
|
|
|
// SingleRegex
|
|
|
|
// Compiles `pattern` with PCRE2 (plus JIT) and caches the data needed to run matches with it.
//
// pattern         - regular expression source text.
// error           - in/out flag. If it is already set on entry nothing is compiled. It is set to true
//                   when pattern compilation, JIT compilation or match-data allocation fails.
// regexName       - name used in log messages and returned by getName().
// bNoRegex        - stored in m_noRegex; when set, findAllMatches() reports a simulated match
//                   instead of scanning the subject.
// regexMatchName  - capture name reported by the simulated match.
// regexMatchValue - value reported by the simulated match.
//
// On any failure m_re and m_matchData are left NULL.
SingleRegex::SingleRegex(
    const std::string& pattern,
    bool& error,
    const std::string& regexName,
    bool bNoRegex,
    const std::string &regexMatchName,
    const std::string &regexMatchValue)
:
    m_re(NULL),
    m_matchData(NULL),
    m_regexName(regexName),
    m_noRegex(bNoRegex),
    m_regexMatchName(regexMatchName),
    m_regexMatchValue(regexMatchValue)
{
    // Give the counter a defined value even when construction bails out early below.
    m_captureGroupsCount = 0;

    dbgTrace(D_WAAP_REGEX) << "Create SingleRegex '" << m_regexName << "' PATTERN: '" << pattern << "'";

    if (error) {
        // Skip initialization if already in error condition
        dbgError(D_WAAP_REGEX) << "Skip compiling regex: " << m_regexName << " (single) due to previous error";
        return;
    }

    int errorCode = 0;
    PCRE2_SIZE errorOffset = 0;
    m_re = pcre2_compile(
        reinterpret_cast<PCRE2_SPTR>(pattern.data()),
        pattern.size(),
        0,
        &errorCode,
        &errorOffset,
        NULL
    );

    // Check the compilation result first: the JIT compiler must not be handed a NULL pattern, and a
    // compile failure should be reported as such (not as a JIT failure).
    if (m_re == NULL) {
        PCRE2_UCHAR errMessage[4096] = {0};
        pcre2_get_error_message(errorCode, errMessage, sizeof(errMessage));
        dbgError(D_WAAP_REGEX) << "pcre2_compile failed: error (" << errorCode << "), " << errMessage <<
            ", at offset " << errorOffset << " in pattern (single) of regex " << m_regexName << ".";
        dbgError(D_WAAP_REGEX) << "pattern: '" << pattern.c_str() << "'";
        error = true;
        return;
    }

    // A JIT failure is flagged as an error, but the (interpreted) pattern is still kept and set up.
    if (pcre2_jit_compile(m_re, PCRE2_JIT_COMPLETE) < 0) {
        dbgError(D_WAAP_REGEX) << "pcre2_jit_compile failed for regex: " << m_regexName << " (single)";
        error = true;
    }

    // Create matchData object that is ready to receive any possible match from m_re
    m_matchData = pcre2_match_data_create_from_pattern(m_re, NULL);

    if (m_matchData == NULL) {
        dbgError(D_WAAP_REGEX) << "pcre2_compile failed to allocate matchData. pattern: '" << pattern << "'";
        pcre2_code_free(m_re);
        m_re = NULL;
        // The object cannot match anything in this state - let the caller know.
        error = true;
        return;
    }

    // Get info about compiled pattern
    pcre2_pattern_info(m_re, PCRE2_INFO_CAPTURECOUNT, &m_captureGroupsCount);
    PCRE2_SPTR nameTable = NULL;
    uint32_t nameCount = 0;
    uint32_t nameEntrySize = 0;
    pcre2_pattern_info(m_re, PCRE2_INFO_NAMECOUNT, &nameCount);
    pcre2_pattern_info(m_re, PCRE2_INFO_NAMEENTRYSIZE, &nameEntrySize);
    pcre2_pattern_info(m_re, PCRE2_INFO_NAMETABLE, &nameTable);

    // Allocate enough items for group names to be indexed by capture group index
    // Note that capture groups are numbered starting from 1. Group "0" is for the "whole match"
    m_captureNames.resize(m_captureGroupsCount + 1);

    for (uint32_t i = 0; i < nameCount; i++) {
        PCRE2_SPTR nameTableEntry = nameTable + i * nameEntrySize;
        // According to pcre2 docs, each entry starts with 16-bit capture index (big-endian). Consume it.
        uint16_t captureIndex = static_cast<uint16_t>((nameTableEntry[0] << 8) | nameTableEntry[1]);
        nameTableEntry += sizeof(uint16_t);

        // Defensive: never write outside of the table sized from the capture count.
        if (captureIndex >= m_captureNames.size()) {
            continue;
        }

        // After the index comes zero-terminated capture name. Consume it too.
        m_captureNames[captureIndex] = reinterpret_cast<const char*>(nameTableEntry);
    }
}
|
|
|
|
// Releases the PCRE2 resources owned by this object: the match data block first, then the compiled
// pattern. Either of them may be NULL when construction failed.
SingleRegex::~SingleRegex() {
    if (m_matchData != NULL) {
        pcre2_match_data_free(m_matchData);
    }

    if (m_re != NULL) {
        pcre2_code_free(m_re);
    }
}
|
|
|
|
bool SingleRegex::hasMatch(const std::string& s) const {
|
|
int rc = pcre2_match(
|
|
m_re, // code
|
|
reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), // subject/subject length
|
|
0, // start offset
|
|
0, // options
|
|
m_matchData,
|
|
NULL // match_context
|
|
);
|
|
|
|
if (rc <= 0) {
|
|
if (rc != PCRE2_ERROR_NOMATCH) {
|
|
PCRE2_UCHAR errmsg[4096];
|
|
pcre2_get_error_message(rc, errmsg, sizeof(errmsg) - 1);
|
|
dbgDebug(D_WAAP_REGEX) << "SingleRegex['" << m_regexName << "']::hasMatch " <<
|
|
"failed with error code: " << rc << " ('" << errmsg << "')";
|
|
}
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
size_t SingleRegex::findAllMatches(const std::string& s, std::vector<RegexMatch>& matches, size_t maxMatches) const {
|
|
size_t matchesCount = 0;
|
|
|
|
// Optimized regex that always immediately reports a "simulated" match without spending time to do a scan
|
|
if (m_noRegex) {
|
|
RegexMatch match;
|
|
// Group 0 is "whole match" must always be present and have no name
|
|
match.groups.push_back(
|
|
RegexMatch::MatchGroup(
|
|
1,
|
|
"",
|
|
m_regexMatchValue
|
|
)
|
|
);
|
|
// Group 1 is "specific match" must be present and have a name
|
|
match.groups.push_back(
|
|
RegexMatch::MatchGroup(
|
|
2,
|
|
m_regexMatchName,
|
|
m_regexMatchValue
|
|
)
|
|
);
|
|
matches.push_back(match);
|
|
matchesCount++;
|
|
return matchesCount;
|
|
}
|
|
|
|
PCRE2_SIZE startOffset = 0;
|
|
|
|
do {
|
|
int rc = pcre2_match(
|
|
m_re, // code
|
|
reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), // subject/subject length
|
|
startOffset, // start offset
|
|
0, // options
|
|
m_matchData,
|
|
NULL // match_context
|
|
);
|
|
|
|
if (rc <= 0) {
|
|
if (rc != PCRE2_ERROR_NOMATCH) {
|
|
PCRE2_UCHAR errmsg[4096];
|
|
pcre2_get_error_message(rc, errmsg, sizeof(errmsg) - 1);
|
|
dbgDebug(D_WAAP_REGEX) << "SingleRegex['" << m_regexName << "']::findAllMatches " <<
|
|
"failed with error code: " << rc << " ('" << errmsg << "')";
|
|
}
|
|
break;
|
|
}
|
|
|
|
int highestMatchedGroupIndex = rc;
|
|
|
|
// Get pointer to array of offsets into s, and its size
|
|
uint32_t ovCount = pcre2_get_ovector_count(m_matchData);
|
|
PCRE2_SIZE* ov = pcre2_get_ovector_pointer(m_matchData);
|
|
|
|
RegexMatch match;
|
|
match.groups.reserve(ovCount);
|
|
|
|
dbgTrace(D_WAAP_REGEX) << "regex '" << m_regexName << "', captureGroupsCount = " <<
|
|
m_captureGroupsCount << ". ovCount = " << ovCount << "; highestMatchedGroupIndex = " <<
|
|
highestMatchedGroupIndex;
|
|
|
|
// ov is vector of ovCount pairs of PCRE2_SIZE values.
|
|
// First entry in pair is offset of start of the match (in s),
|
|
// second entry is offset of character one after end of the match.
|
|
// Walk over all matches and fill them here (-1 because first one isn't included in ovCount).
|
|
for (int groupIndex = 1; groupIndex < highestMatchedGroupIndex; ++groupIndex) {
|
|
PCRE2_SIZE rangeStart = ov[groupIndex * 2];
|
|
PCRE2_SIZE rangeEnd = ov[groupIndex * 2 + 1];
|
|
|
|
// Skip matches that are not set
|
|
if (rangeStart == PCRE2_UNSET || rangeEnd == PCRE2_UNSET) {
|
|
continue;
|
|
}
|
|
|
|
dbgTrace(D_WAAP_REGEX) << "groupIndex=" << groupIndex << " ['" << m_captureNames[groupIndex] <<
|
|
"']: range " << rangeStart << " -> " << rangeEnd;
|
|
match.groups.push_back(
|
|
RegexMatch::MatchGroup(
|
|
groupIndex,
|
|
m_captureNames[groupIndex],
|
|
s.substr(rangeStart, rangeEnd - rangeStart)
|
|
)
|
|
);
|
|
}
|
|
|
|
matches.push_back(match);
|
|
|
|
// Count matches found in this SingleRegex
|
|
matchesCount++;
|
|
|
|
// continue searching for next match starting from end of this match
|
|
// (first two entries in ov[] are start and end offsets of current full match)
|
|
startOffset = ov[1];
|
|
} while (matchesCount < maxMatches);
|
|
|
|
return matchesCount;
|
|
}
|
|
|
|
const std::string &SingleRegex::getName() const
|
|
{
|
|
return m_regexName;
|
|
}
|
|
|
|
size_t SingleRegex::findMatchRanges(const std::string& s, std::vector<RegexMatchRange>& matchRanges) const {
|
|
PCRE2_SIZE startOffset = 0;
|
|
|
|
do {
|
|
int rc = pcre2_match(
|
|
m_re, // code
|
|
reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), // subject/subject length
|
|
startOffset, // start offset
|
|
0, // options
|
|
m_matchData,
|
|
NULL // match_context
|
|
);
|
|
|
|
// Note: PCRE2_ERROR_NOMATCH is the normal situation here, but there could be other errors.
|
|
// However, whichever error occurred, the loop is stopped.
|
|
if (rc <= 0) {
|
|
if (rc != PCRE2_ERROR_NOMATCH) {
|
|
PCRE2_UCHAR errmsg[4096];
|
|
pcre2_get_error_message(rc, errmsg, sizeof(errmsg) - 1);
|
|
dbgDebug(D_WAAP_REGEX) << "SingleRegex['" << m_regexName << "']::findMatchRanges " <<
|
|
"failed with error code: " << rc << " ('" << errmsg << "')";
|
|
}
|
|
break;
|
|
}
|
|
|
|
// Get pointer to array of offsets into s
|
|
PCRE2_SIZE* ov = pcre2_get_ovector_pointer(m_matchData);
|
|
|
|
// start searching for next match starting from end of this match
|
|
// (first two entries in ov[] are start and end offsets of current full match)
|
|
startOffset = ov[1];
|
|
|
|
matchRanges.push_back(RegexMatchRange(ov[0], ov[1]));
|
|
} while (true);
|
|
|
|
return matchRanges.size();
|
|
}
|
|
|
|
// Regex
|
|
|
|
// Builds a Regex that consists of exactly one SingleRegex compiled from `pattern`.
// Nothing is compiled when `error` is already set on entry; the SingleRegex constructor receives
// the same `error` reference and may set it.
Regex::Regex(const std::string& pattern, bool &error, const std::string& regexName)
:
    m_regexName(regexName),
    m_regexPreconditions(nullptr) // no need for preconditions for single regex mode
{
    if (!error) {
        m_sre.push_back(new SingleRegex(pattern, error, m_regexName));
        return;
    }

    // Already in error condition - skip initialization
    dbgError(D_WAAP_REGEX) << "Skip compiling regex: " << m_regexName << " (single) due to previous error";
}
|
|
|
|
// Divide regexp patterns longer than the limit (imposed by pcre2 library!) into multiple regexes.
|
|
#define REGEX_PATT_MAX_SIZE 0
|
|
|
|
// Builds a Regex out of a list of patterns. With REGEX_PATT_MAX_SIZE defined as 0 every non-empty
// pattern is compiled into its own SingleRegex (wrapped in an extra pair of parentheses).
//
// patterns           - regex pattern strings.
// error              - in/out flag. If already set on entry nothing is compiled. The same reference is
//                      handed to every SingleRegex constructed here, which may set it.
// regexName          - name of this Regex; SingleRegex names are built as "<regexName>/<pattern>".
// regexPreconditions - optional. When present, each SingleRegex index is registered in
//                      m_wordToRegexIndices under the word returned by getWordByRegex(pattern);
//                      otherwise all indices are registered under emptyWordIndex.
Regex::Regex(
    const std::vector<std::string> & patterns,
    bool &error,
    const std::string & regexName,
    std::shared_ptr<Waap::RegexPreconditions> regexPreconditions)
:
    m_regexName(regexName),
    m_regexPreconditions(regexPreconditions)
{
    if (error) {
        // Skip initialization if already in error condition
        dbgError(D_WAAP_REGEX) << "Skip compiling regex: " << m_regexName << " due to previous error";
        return;
    }

    // This regex helps to parse out group names from regex patterns
    SingleRegex patternParseRegex("^\\(\\?P<(.*?)>(.*?)\\)$", error, "patternParseRegex");

    std::string acc;

    for (const std::string& pattern : patterns) {
        if ((acc.size() + pattern.size()) > REGEX_PATT_MAX_SIZE) {
            if (!acc.empty()) {
                assert(false); // this should never happen
                m_sre.push_back(new SingleRegex(acc + ")", error, m_regexName));
                acc = "(" + pattern;
            }
            else
            {
                bool bNoRegex = false;
                std::string regexMatchName;
                std::string regexMatchValue;

                // This is the only place where patterns are loaded (one-by-one)
                if (m_regexPreconditions) {
                    // If preconditions are enabled on this Regex instance - build list of indices of SingleRegex
                    // that should be triggered (executed) for each related word found by aho-corasick pattern scan.
                    Waap::RegexPreconditions::WordIndex wordIndex =
                        m_regexPreconditions->getWordByRegex(pattern);

                    // Extract group name from the regex pattern string
                    if (m_regexPreconditions->isNoRegexPattern(pattern)) {
                        // This word should not be scanned with regex. Instead, it should directly return a match
                        std::vector<RegexMatch> parsedMatches;
                        patternParseRegex.findAllMatches(pattern, parsedMatches);

                        if (!parsedMatches.empty() && !parsedMatches[0].groups.empty()) {
                            bNoRegex = true;
                            // First stored group is the name captured between "(?P<" and ">"
                            regexMatchName = parsedMatches[0].groups[0].value;
                            regexMatchValue = m_regexPreconditions->getWordStrByWordIndex(wordIndex);
                        }
                        else {
                            // The pattern is not of the "(?P<name>...)" form (or the helper regex is unusable),
                            // so there is no group name to report. Keep bNoRegex off so the pattern is still
                            // scanned as a regular regex instead of reading a non-existent match.
                            dbgError(D_WAAP_REGEX) << "Regex['" << m_regexName <<
                                "']: failed to parse group name from no-regex pattern '" << pattern <<
                                "', falling back to regex scan";
                        }
                    }

                    // For each word - build list of SingleRegex indices to be scanned if that word is detected
                    // Note that if aho-corasick word for this regex is not yet defined it will enter the [""] entry
                    // and will always be executed. This is less efficient but ensures correct attack detection.
                    m_wordToRegexIndices[wordIndex].push_back(m_sre.size());
                }
                else {
                    // If preconditions are not enabled on this Regex instance - all SingleRegexes in it will always
                    // be executed.
                    m_wordToRegexIndices[Waap::RegexPreconditions::emptyWordIndex].push_back(m_sre.size());
                }

                m_sre.push_back(new SingleRegex("(" + pattern+ ")", error, m_regexName + "/" + pattern, bNoRegex,
                    regexMatchName, regexMatchValue));
            }
        }
        else {
            assert(false); // this should never happen anymore.
            // Add | character between individual patterns, but not before the very first one!
            if (acc.empty()) {
                // first group
                acc = "(" + pattern;
            }
            else {
                // non-first group
                acc += "|" + pattern;
            }
        }
    }

    if (acc.size() > 0) {
        assert(false); // this should never happen anymore.
        m_sre.push_back(new SingleRegex(acc + ")", error, m_regexName));
    }
}
|
|
|
|
// Destroys every SingleRegex that was allocated by the constructors and stored in m_sre.
Regex::~Regex() {
    for (SingleRegex* singleRegex : m_sre) {
        // Deleting a null pointer is a no-op, so no explicit null check is needed.
        delete singleRegex;
    }
}
|
|
|
|
bool Regex::hasMatch(const std::string& s) const {
|
|
for (std::vector<SingleRegex*>::const_iterator ppSingleRegex = m_sre.begin();
|
|
ppSingleRegex != m_sre.end();
|
|
++ppSingleRegex) {
|
|
SingleRegex* pSingleRegex = *ppSingleRegex;
|
|
|
|
if (pSingleRegex->hasMatch(s)) {
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_regexName << "']['" << pSingleRegex->getName() <<
|
|
"']::hasMatch() found!";
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
size_t Regex::findAllMatches(const std::string& s, std::vector<RegexMatch>& matches,
|
|
const Waap::RegexPreconditions::PmWordSet *pmWordSet, size_t maxMatches) const {
|
|
matches.clear();
|
|
|
|
if (m_regexPreconditions && pmWordSet) {
|
|
// If preconditions are enabled on this regex - execute them to make scanning more efficient
|
|
std::unordered_set<size_t> dupIndices;
|
|
|
|
for (Waap::RegexPreconditions::WordIndex wordIndex : *pmWordSet) {
|
|
const auto &found = m_wordToRegexIndices.find(wordIndex);
|
|
|
|
// Check that the wordIndex is related to this instance of Regex object
|
|
if (found == m_wordToRegexIndices.end()) {
|
|
continue;
|
|
}
|
|
|
|
const std::vector<size_t> ®exIndicesList = found->second;
|
|
|
|
for (size_t regexIndex : regexIndicesList) {
|
|
if (dupIndices.find(regexIndex) != dupIndices.end()) {
|
|
// Avoid scanning the same regex index twice (in case it is registered for more than one wordIndex)
|
|
continue;
|
|
}
|
|
|
|
// Scan only regexes that are enabled by aho-corasick scan
|
|
m_sre[regexIndex]->findAllMatches(s, matches, maxMatches);
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_sre[regexIndex]->getName() <<
|
|
"',index=" << regexIndex << "]::findAllMatches(): " << matches.size() << " matches found (so far)";
|
|
|
|
dupIndices.insert(regexIndex);
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
// When optimization is disabled - scan all regexes
|
|
for (SingleRegex* pSingleRegex : m_sre) {
|
|
pSingleRegex->findAllMatches(s, matches, maxMatches);
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_regexName << "']['" << pSingleRegex->getName() <<
|
|
"']::findAllMatches(): " << matches.size() << " matches found (so far)";
|
|
}
|
|
}
|
|
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_regexName << "']::findAllMatches(): total " <<
|
|
matches.size() << " matches found.";
|
|
return matches.size();
|
|
}
|
|
|
|
inline bool consolidateMatchRangesSortFunc(const RegexMatchRange& a, const RegexMatchRange& b) {
|
|
return a.start > b.start;
|
|
}
|
|
|
|
// Consolidate ranges in-place (algorithm adapted from this solution:
|
|
// http://www.geeksforgeeks.org/merging-intervals)
|
|
static void consolidateMatchRanges(std::vector<RegexMatchRange>& matchRanges) {
|
|
// Sort ranges in decreasing order of their start offsets (O(logN) time)
|
|
std::sort(matchRanges.begin(), matchRanges.end(), consolidateMatchRangesSortFunc);
|
|
int lastIndex = 0; // index of last range in matchRanges vector (up to this range everything is merged)
|
|
|
|
// Traverse all ranges and merge where necessary
|
|
for (size_t i = 0; i < matchRanges.size(); ++i) {
|
|
// If this is not first range and it overlaps with the previous range
|
|
if (lastIndex != 0 && matchRanges[lastIndex - 1].start < matchRanges[i].end) {
|
|
while (lastIndex != 0 && matchRanges[lastIndex - 1].start < matchRanges[i].end) {
|
|
// merge previous and current ranges
|
|
matchRanges[lastIndex - 1].end = std::max(matchRanges[lastIndex - 1].end, matchRanges[i].end);
|
|
matchRanges[lastIndex - 1].start = std::min(matchRanges[lastIndex - 1].start, matchRanges[i].start);
|
|
lastIndex--;
|
|
}
|
|
}
|
|
else {
|
|
// Doesn't overlap with previous (or no previous because this is first range),
|
|
// add the range as-is
|
|
matchRanges[lastIndex] = matchRanges[i];
|
|
}
|
|
|
|
lastIndex++;
|
|
}
|
|
|
|
// Keep only merged ranges. Erase extra ranges that are not used anymore
|
|
matchRanges.resize(lastIndex);
|
|
}
|
|
|
|
std::string Regex::sub(const std::string& s, const std::string& repl) const {
|
|
std::vector<RegexMatchRange> matchRanges;
|
|
|
|
// Find all ranges of all matches
|
|
for (std::vector<SingleRegex*>::const_iterator ppSingleRegex = m_sre.begin();
|
|
ppSingleRegex != m_sre.end();
|
|
++ppSingleRegex) {
|
|
SingleRegex* pSingleRegex = *ppSingleRegex;
|
|
pSingleRegex->findMatchRanges(s, matchRanges);
|
|
#ifdef WAF2_LOGGING_ENABLE
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_regexName << "']['" << pSingleRegex->getName() <<
|
|
"']::sub(): " << matchRanges.size() << " match ranges found (so far):";
|
|
for (size_t i = 0; i < matchRanges.size(); ++i) {
|
|
dbgTrace(D_WAAP_REGEX) << "Range [" << i << "]: " << matchRanges[i].start << " -> " << matchRanges[i].end;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// No matches - nothing to replace.
|
|
if (matchRanges.empty()) {
|
|
return s;
|
|
}
|
|
|
|
// Match ranges collected from multiple single regexps could overlap and be out of order
|
|
// This function sorts the ranges in place (in decreasing order) and also consolidates overlapping
|
|
// ranges so they do not overlap.
|
|
consolidateMatchRanges(matchRanges);
|
|
|
|
#ifdef WAF2_LOGGING_ENABLE
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_regexName << "']::sub(): " <<
|
|
matchRanges.size() << " match ranges (after consolidation):";
|
|
for (size_t i = 0; i < matchRanges.size(); ++i) {
|
|
dbgTrace(D_WAAP_REGEX) << "Range [" << i << "]: " << matchRanges[i].start << " -> " << matchRanges[i].end;
|
|
}
|
|
#endif
|
|
|
|
// Now walk over (consolidated) ranges (that are now guaranteed not to overlap), and copy everything around them
|
|
// Note that ranges are still sorted in decreasing order, so we traverse the list backwards to see them in
|
|
// increasing order
|
|
PCRE2_SIZE startOffset = 0;
|
|
std::string outStr;
|
|
|
|
for (std::vector<RegexMatchRange>::const_reverse_iterator pMatchRange = matchRanges.rbegin();
|
|
pMatchRange != matchRanges.rend();
|
|
++pMatchRange) {
|
|
// Add everything since startOffset until start of current range
|
|
outStr += s.substr(startOffset, pMatchRange->start - startOffset);
|
|
|
|
// Add replacement
|
|
if (!repl.empty()) {
|
|
outStr += repl;
|
|
}
|
|
// Keep copying only after end of current range
|
|
startOffset = pMatchRange->end;
|
|
}
|
|
|
|
// Add remainder of string after last range
|
|
outStr += s.substr(startOffset);
|
|
return outStr;
|
|
}
|
|
|
|
// TODO:: refactor out with C++ functor instead of C-style pointer-callback!
|
|
void
|
|
Regex::sub(
|
|
const std::string& s,
|
|
Waap::Util::RegexSubCallback_f cb,
|
|
int& decodedCount,
|
|
int& deletedCount,
|
|
std::string& outStr) const
|
|
{
|
|
decodedCount = 0;
|
|
deletedCount = 0;
|
|
|
|
// Clear outStr, it will be filled with output string (with changes, if applicable)
|
|
outStr.clear();
|
|
|
|
std::vector<RegexMatchRange> matchRanges;
|
|
|
|
// Find all ranges of all matches
|
|
for (std::vector<SingleRegex*>::const_iterator ppSingleRegex = m_sre.begin();
|
|
ppSingleRegex != m_sre.end();
|
|
++ppSingleRegex) {
|
|
SingleRegex* pSingleRegex = *ppSingleRegex;
|
|
pSingleRegex->findMatchRanges(s, matchRanges);
|
|
#ifdef WAF2_LOGGING_ENABLE
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_regexName << "']['" << pSingleRegex->getName()
|
|
<< "']::sub(): " << matchRanges.size() << " match ranges found (so far):";
|
|
for (size_t i = 0; i < matchRanges.size(); ++i) {
|
|
dbgTrace(D_WAAP_REGEX) << "Range [" << i << "]: " << matchRanges[i].start << " -> " << matchRanges[i].end;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// No matches - nothing to replace.
|
|
if (matchRanges.empty()) {
|
|
outStr = s;
|
|
return;
|
|
}
|
|
|
|
// Match ranges collected from multiple single regexps could overlap and be out of order
|
|
// This function sorts the ranges in place (in decreasing order) and also consolidates
|
|
// overlapping ranges so they do not overlap.
|
|
consolidateMatchRanges(matchRanges);
|
|
|
|
#ifdef WAF2_LOGGING_ENABLE
|
|
dbgTrace(D_WAAP_REGEX) << "Regex['" << m_regexName << "']::sub(): " <<
|
|
matchRanges.size() << " match ranges (after consolidation):";
|
|
for (size_t i = 0; i < matchRanges.size(); ++i) {
|
|
dbgTrace(D_WAAP_REGEX) << "Range [" << i << "]: " << matchRanges[i].start << " -> " << matchRanges[i].end;
|
|
}
|
|
#endif
|
|
|
|
// Now walk over (consolidated) ranges (that are now guaranteed not to overlap), and copy everything around them
|
|
// Note that ranges are still sorted in decreasing order, so we traverse the list backwards to see them in
|
|
// increasing order
|
|
PCRE2_SIZE startOffset = 0;
|
|
|
|
for (std::vector<RegexMatchRange>::const_reverse_iterator pMatchRange = matchRanges.rbegin();
|
|
pMatchRange != matchRanges.rend();
|
|
++pMatchRange) {
|
|
// Add everything since startOffset until start of current range
|
|
outStr += s.substr(startOffset, pMatchRange->start - startOffset);
|
|
|
|
// Compute replacement
|
|
std::string repl;
|
|
if (cb(s, s.begin() + pMatchRange->start, s.begin() + pMatchRange->end, repl)) {
|
|
if (!repl.empty()) {
|
|
outStr += repl;
|
|
decodedCount++;
|
|
}
|
|
else {
|
|
deletedCount++;
|
|
}
|
|
}
|
|
else {
|
|
// if callback told us the chunk was not processed - put original text inside
|
|
outStr += s.substr(pMatchRange->start, pMatchRange->end - pMatchRange->start);
|
|
}
|
|
|
|
// Keep copying only after end of current range
|
|
startOffset = pMatchRange->end;
|
|
}
|
|
|
|
// Add remainder of string after last range
|
|
outStr += s.substr(startOffset);
|
|
return;
|
|
}
|
|
|
|
const std::string &Regex::getName() const
|
|
{
|
|
return m_regexName;
|
|
}
|