// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved. // Licensed under the Apache License, Version 2.0 (the "License"); // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "WaapRegexPreconditions.h" #include "Waf2Util.h" #include "debug.h" #include USE_DEBUG_FLAG(D_WAAP_REGEX); namespace Waap { const RegexPreconditions::WordIndex RegexPreconditions::emptyWordIndex = 0; RegexPreconditions::RegexPreconditions(const picojson::value::object &jsObj, bool &error) { // Register empty string work under known index registerWord(""); // The key should always be there unless data file is corrupted (but there's a unit test that tests exactly // that!) if (jsObj.find("preconditions") == jsObj.end()) { dbgError(D_WAAP_REGEX) << "Error loading regex preconditions (signatures data file corrupt?)..."; error = true; return; } if (jsObj.find("precondition_keys") == jsObj.end()) { dbgError(D_WAAP_REGEX) << "Error loading regex precondition sets (signatures data file corrupt?)..."; error = true; return; } auto preconditions = jsObj.at("preconditions").get(); // Loop over pre-conditions (rules) and load them dbgTrace(D_WAAP_REGEX) << "Loading regex preconditions..."; for (const auto &precondition : preconditions) { // Each precondition consists of an aho-corasick pattern matcher word as a key and list of actions // (for that word) - as a value. const std::string wordStr = precondition.first; // Information from the "empty string"" word is not required by the engine to operate if (wordStr.empty()) { continue; } WordIndex wordIndex = registerWord(wordStr); if (boost::algorithm::ends_with(wordStr, "_napost_napre")) { WordIndex baseWordIndex = registerWord(wordStr.substr(0, wordStr.size() - strlen("_napost_napre"))); m_pmWordInfo[baseWordIndex].napostNapreWordIndex = wordIndex; m_pmWordInfo[wordIndex].baseWordIndex = baseWordIndex; } else if (boost::algorithm::ends_with(wordStr, "_napost")) { WordIndex baseWordIndex = registerWord(wordStr.substr(0, wordStr.size() - strlen("_napost"))); m_pmWordInfo[baseWordIndex].napostWordIndex = wordIndex; m_pmWordInfo[wordIndex].baseWordIndex = baseWordIndex; } else if (boost::algorithm::ends_with(wordStr, "_napre")) { WordIndex baseWordIndex = registerWord(wordStr.substr(0, wordStr.size() - strlen("_napre"))); m_pmWordInfo[baseWordIndex].napreWordIndex = wordIndex; m_pmWordInfo[wordIndex].baseWordIndex = baseWordIndex; } // Load actions const auto &jsActionsList = precondition.second.get(); for (const auto &jsAction : jsActionsList) { const auto &action = jsAction.get(); if (action.empty()) { continue; } // The first item in the Action json object (it's a tuple of 1 or more items) is an action type string. const std::string actionType = action[0].get(); // There are currently three action types: // 1. "regex" - allow specific regex to be scanned when the Aho-Corasick word is detected // 2. "set" - specify another "prefix" (string) to be enabled when the Aho-Corasick word is detected. // if at least one prefix is enabled - it will trigger one or more other regexes. // 3. "and_condition" - specify (comma-separated) sorted list of "prefixes" (in one string). // all of these prefixes should come together in order to complete a set to match a // condition and enable one or more other regexes. if (actionType == "regex" && action.size() >= 3) { const std::string regexPattern = action[1].get(); if (m_regexToWordMap.find(regexPattern) != m_regexToWordMap.end() && m_regexToWordMap[regexPattern] != wordIndex) { dbgError(D_WAAP_REGEX) << "ERROR: trying to overwrite m_regexToWordMap. pattern='" << regexPattern << "'. Old wordIndex='" << m_regexToWordMap[regexPattern] << "' new word='" << wordStr << "' (wordIndex=" << wordIndex << ")"; error = true; return; } std::string flags = action[2].get(); if (flags == "_noregex") { // Add regex pattern to set of "noRegex" patterns m_noRegexPatterns.insert(regexPattern); m_pmWordInfo[wordIndex].noRegex = true; } m_regexToWordMap[regexPattern] = wordIndex; } else if (actionType == "set" && action.size() >= 2) { const std::string setValueStr = action[1].get(); WordIndex setValueIndex = registerWord(setValueStr); std::vector &prefixSet = m_wordToPrefixSet[wordIndex]; if (std::find(prefixSet.begin(), prefixSet.end(), setValueIndex) == prefixSet.end()) { prefixSet.push_back(setValueIndex); } } else if (actionType == "and_condition" && action.size() >= 2) { const std::string groupValueStr = action[1].get(); WordIndex groupValueIndex = registerWord(groupValueStr); size_t expectedCount = static_cast(std::stoi(groupValueStr)); auto value(std::make_pair(groupValueIndex, expectedCount)); std::vector> &prefixGroup = m_wordToPrefixGroup[wordIndex]; if (std::find(prefixGroup.begin(), prefixGroup.end(), value) == prefixGroup.end()) { prefixGroup.push_back(value); } } } } // Build full list of words to load into aho-corasick pattern matcher dbgTrace(D_WAAP_REGEX) << "Loading regex precondition_keys into Aho-Corasick pattern matcher..."; auto preconditionKeys = jsObj.at("precondition_keys").get(); std::set pmPatterns; for (const auto &preconditionKey : preconditionKeys) { std::string wordStr(preconditionKey.get()); // Do not load the "empty" word into Aho-Corasick. It's meaningless and Aho prepare() call would fail. if (wordStr.empty()) { continue; } WordIndex wordIndex = registerWord(wordStr); WordIndex napreWordIndex = m_pmWordInfo[wordIndex].napreWordIndex; WordIndex napostWordIndex = m_pmWordInfo[wordIndex].napostWordIndex; WordIndex napostNapreWordIndex = m_pmWordInfo[wordIndex].napostNapreWordIndex; bool noRegex = ((napreWordIndex != emptyWordIndex) && m_pmWordInfo[napreWordIndex].noRegex) || ((napostWordIndex != emptyWordIndex) && m_pmWordInfo[napostWordIndex].noRegex) || ((napostNapreWordIndex != emptyWordIndex) && m_pmWordInfo[napostNapreWordIndex].noRegex); pmPatterns.insert(PMPattern(wordStr, false, false, wordIndex, noRegex)); } // Initialize the aho-corasick pattern matcher with the patterns Maybe pmHookStatus = m_pmHook.prepare(pmPatterns); if (!pmHookStatus.ok()) { dbgError(D_WAAP_REGEX) << "Aho-Corasick engine failed to load!"; error = true; return; } dbgTrace(D_WAAP_REGEX) << "Aho-Corasick engine loaded."; dbgTrace(D_WAAP_REGEX) << "Aho-corasick pattern matching engine initialized!"; } bool Waap::RegexPreconditions::isNoRegexPattern(const std::string &pattern) const { return m_noRegexPatterns.find(pattern) != m_noRegexPatterns.end(); } const std::string &Waap::RegexPreconditions::getWordStrByWordIndex(WordIndex wordIndex) const { WordIndex baseWordIndex = m_pmWordInfo[wordIndex].baseWordIndex; if (baseWordIndex != Waap::RegexPreconditions::emptyWordIndex) { return m_pmWordInfo[baseWordIndex].wordStr; } return m_pmWordInfo[wordIndex].wordStr; } // Check that the regex pattern (string) is known to be related to an Aho-Corasick word/prefix // Returns empty string if not found, or the Aho-Corasick/prefix string otherwise. // This function is called during each Regex object creation and helps to pre-compute data required for a fast // lookup later during traffic processing. Waap::RegexPreconditions::WordIndex RegexPreconditions::getWordByRegex(const std::string ®exPattern) const { const auto &found = m_regexToWordMap.find(regexPattern); if (found != m_regexToWordMap.end()) { return found->second; } return Waap::RegexPreconditions::emptyWordIndex; } void RegexPreconditions::processWord(RegexPreconditions::PmWordSet &wordsSet, WordIndex wordIndex) const { const auto &found = m_wordToPrefixSet.find(wordIndex); if (found != m_wordToPrefixSet.end()) { for (const auto &prefixIndex : found->second) { // One of the items in the "OR" condition - add the OR prefix to the wordsSet wordsSet.insert(prefixIndex); } } // Add words from the Aho Corasick scanner wordsSet.insert(wordIndex); } inline bool isRegexWordChar(u_char c) { return Waap::Util::isAlphaAsciiFast(c) || isdigit(c) || '_' == c; } void RegexPreconditions::pass1(RegexPreconditions::PmWordSet &wordsSet, Buffer &&buffer) const { dbgTrace(D_WAAP_REGEX) << "Rules pass #1: collect OR sets"; m_pmHook.scanBufWithOffsetLambda(buffer, [this, &wordsSet, &buffer] (u_int endMatchOffset, const PMPattern &pmPattern, bool matchAll) { uint offset = endMatchOffset + 1 - pmPattern.size(); // reported offset points to last character of a match // Extract the word index from the PMPattern object (we do not need the string part of it) WordIndex wordIndex = pmPattern.getIndex(); bool regexWordBefore = !matchAll && (offset != 0) && (isRegexWordChar(buffer.data()[offset - 1])); bool regexWordAfter = !matchAll && (offset + pmPattern.size() < buffer.size()) && (isRegexWordChar(buffer.data()[offset + pmPattern.size()])); processWord(wordsSet, wordIndex); // Compute additional constraints ([!\w] before, [!\w] after, [!\w] aroung the match ...) WordIndex napreWordIndex = m_pmWordInfo[wordIndex].napreWordIndex; WordIndex napostWordIndex = m_pmWordInfo[wordIndex].napostWordIndex; WordIndex napostNapreWordIndex = m_pmWordInfo[wordIndex].napostNapreWordIndex; if (!regexWordBefore && regexWordAfter) { if (napreWordIndex != emptyWordIndex) { processWord(wordsSet, napreWordIndex); } } else if (regexWordBefore && !regexWordAfter) { if (napostWordIndex != emptyWordIndex) { processWord(wordsSet, napostWordIndex); } } else if (!regexWordBefore && !regexWordAfter) { if (napreWordIndex != emptyWordIndex) { processWord(wordsSet, napreWordIndex); } if (napostWordIndex != emptyWordIndex) { processWord(wordsSet, napostWordIndex); } if (napostNapreWordIndex != emptyWordIndex) { processWord(wordsSet, napostNapreWordIndex); } } }); } void RegexPreconditions::pass2(RegexPreconditions::PmWordSet &wordsSet) const { dbgTrace(D_WAAP_REGEX) << "Rules pass #2: collect AND groups"; std::unordered_map> allGroups; std::vector prefixes; for (WordIndex wordIndex : wordsSet) { // find in wordToPrefixGroup map const auto &found = m_wordToPrefixGroup.find(wordIndex); if (found != m_wordToPrefixGroup.end()) { for (const auto &prefixCountPair : found->second) { WordIndex prefixIndex = prefixCountPair.first; size_t expectedCount = prefixCountPair.second; auto found = allGroups.find(prefixIndex); size_t actualWordCount = 1; if (found == allGroups.end()) { allGroups.emplace(prefixIndex, std::set{wordIndex}); } else { found->second.insert(wordIndex); actualWordCount = found->second.size(); } if (actualWordCount == expectedCount) { // Full "AND" condition collected succesfully - add the AND prefixCountPair to the wordsSet prefixes.push_back(prefixIndex); } } } } for (const auto &prefixIndex : prefixes) { wordsSet.insert(prefixIndex); } } // This function scans the buffer with Aho-Corasick scanner and adds all the words found into wordsSet // It then continues and runs two pass algorithm to compute OR and AND conditions over a prefixes data. // The prefix strings are also added to the wordsSet and are looked up in the same database. void RegexPreconditions::pmScan(Buffer &&buffer, RegexPreconditions::PmWordSet &wordsSet) const { wordsSet.clear(); pass1(wordsSet, std::move(buffer)); pass2(wordsSet); // The empty string key contains all regexes that should always be scanned wordsSet.insert(Waap::RegexPreconditions::emptyWordIndex); } // Get known wordIndex by wordStr, or allocate a new wordIndex for words yet unknown Waap::RegexPreconditions::WordIndex RegexPreconditions::registerWord(const std::string &wordStr) { const auto &found = m_wordStrToIndex.find(wordStr); if (found != m_wordStrToIndex.end()) { return found->second; } else { WordIndex wordIndex = m_pmWordInfo.size(); m_wordStrToIndex[wordStr] = wordIndex; // index of the new element that will be added below... WordInfo wordInfo; wordInfo.wordStr = wordStr; m_pmWordInfo.push_back(wordInfo); return wordIndex; } } }