First release of open-appsec source code

2026-01-02 22:54:46 +03:00 · 2022-10-26 19:33:19 +03:00
parent 3883109caf
commit a883352f79
1353 changed files with 276290 additions and 1 deletions
--- a/components/security_apps/waap/waap_clib/WaapRegexPreconditions.cc
+++ b/components/security_apps/waap/waap_clib/WaapRegexPreconditions.cc
@@ -0,0 +1,341 @@
+// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "WaapRegexPreconditions.h"
+#include "Waf2Util.h"
+#include "debug.h"
+#include <boost/algorithm/string/predicate.hpp>
+
+USE_DEBUG_FLAG(D_WAAP_REGEX);
+
+namespace Waap {
+    const RegexPreconditions::WordIndex RegexPreconditions::emptyWordIndex = 0; // index of the empty ("") word
+
+    // Loads the regex preconditions database from the parsed signatures data.
+    //
+    // jsObj - parsed JSON object; expected to contain the "preconditions" object (Aho-Corasick word -> list of
+    //         actions) and the "precondition_keys" array (words to load into the Aho-Corasick pattern matcher).
+    // error - set to true when one of these keys is missing, when the pattern matcher fails to load, or when the
+    //         same regex pattern is claimed by two different words. Left untouched on success.
+    RegexPreconditions::RegexPreconditions(const picojson::value::object &jsObj, bool &error)
+    {
+        // Suffixes marking a word as a context-dependent variant of a base word. The longest one must be tested
+        // first: a word that ends with "_napost_napre" also ends with "_napre".
+        const std::string napostNapreSuffix("_napost_napre");
+        const std::string napostSuffix("_napost");
+        const std::string napreSuffix("_napre");
+
+        // Register the empty string word first, so that it lands under the known index (emptyWordIndex)
+        registerWord("");
+
+        // The key should always be there unless data file is corrupted (but there's a unit test that tests exactly
+        // that!)
+        if (jsObj.find("preconditions") == jsObj.end()) {
+            dbgError(D_WAAP_REGEX) << "Error loading regex preconditions (signatures data file corrupt?)...";
+            error = true;
+            return;
+        }
+
+        if (jsObj.find("precondition_keys") == jsObj.end()) {
+            dbgError(D_WAAP_REGEX) << "Error loading regex precondition sets (signatures data file corrupt?)...";
+            error = true;
+            return;
+        }
+
+        // Bound by const reference: the data is only read here, so copying the whole JSON object is not needed
+        const auto &preconditions = jsObj.at("preconditions").get<picojson::value::object>();
+
+        // Build full list of words to load into aho-corasick pattern matcher
+        dbgTrace(D_WAAP_REGEX) << "Loading regex precondition_keys into Aho-Corasick pattern matcher...";
+
+        const auto &preconditionKeys = jsObj.at("precondition_keys").get<picojson::value::array>();
+        std::set<PMPattern> pmPatterns;
+
+        for (const auto &preconditionKey : preconditionKeys) {
+            std::string wordStr(preconditionKey.get<std::string>());
+
+            // Do not load the "empty" word into Aho-Corasick. It's meaningless and Aho prepare() call would fail.
+            if (wordStr.empty()) {
+                continue;
+            }
+
+            WordIndex wordIndex = registerWord(wordStr);
+            pmPatterns.insert(PMPattern(wordStr, false, false, wordIndex));
+        }
+
+        // Initialize the aho-corasick pattern matcher with the patterns
+        Maybe<void> pmHookStatus = m_pmHook.prepare(pmPatterns);
+
+        if (!pmHookStatus.ok()) {
+            dbgError(D_WAAP_REGEX) << "Aho-Corasick engine failed to load!";
+            error = true;
+            return;
+        }
+
+        dbgTrace(D_WAAP_REGEX) << "Aho-Corasick engine loaded.";
+
+        // Loop over pre-conditions (rules) and load them
+        dbgTrace(D_WAAP_REGEX) << "Loading regex preconditions...";
+
+        for (const auto &precondition : preconditions)
+        {
+            // Each precondition consists of an aho-corasick pattern matcher word as a key and list of actions
+            // (for that word) - as a value.
+            const std::string &wordStr = precondition.first;
+
+            // Information from the "empty string" word is not required by the engine to operate
+            if (wordStr.empty()) {
+                continue;
+            }
+
+            const WordIndex wordIndex = registerWord(wordStr);
+
+            // A suffixed word is linked both ways with its base word (the word with the suffix stripped):
+            // the base word records the index of the variant, and the variant records the index of its base.
+            if (boost::algorithm::ends_with(wordStr, napostNapreSuffix)) {
+                WordIndex baseWordIndex = registerWord(wordStr.substr(0, wordStr.size() - napostNapreSuffix.size()));
+                m_pmWordInfo[baseWordIndex].napostNapreWordIndex = wordIndex;
+                m_pmWordInfo[wordIndex].baseWordIndex = baseWordIndex;
+            }
+            else if (boost::algorithm::ends_with(wordStr, napostSuffix)) {
+                WordIndex baseWordIndex = registerWord(wordStr.substr(0, wordStr.size() - napostSuffix.size()));
+                m_pmWordInfo[baseWordIndex].napostWordIndex = wordIndex;
+                m_pmWordInfo[wordIndex].baseWordIndex = baseWordIndex;
+            }
+            else if (boost::algorithm::ends_with(wordStr, napreSuffix)) {
+                WordIndex baseWordIndex = registerWord(wordStr.substr(0, wordStr.size() - napreSuffix.size()));
+                m_pmWordInfo[baseWordIndex].napreWordIndex = wordIndex;
+                m_pmWordInfo[wordIndex].baseWordIndex = baseWordIndex;
+            }
+
+            // Load actions
+            const auto &jsActionsList = precondition.second.get<picojson::value::array>();
+
+            for (const auto &jsAction : jsActionsList) {
+                const auto &action = jsAction.get<picojson::value::array>();
+
+                if (action.empty()) {
+                    continue;
+                }
+
+                // The first item in the Action json object (it's a tuple of 1 or more items) is an action type string.
+                const std::string &actionType = action[0].get<std::string>();
+
+                // There are currently three action types:
+                //  1. "regex" - allow specific regex to be scanned when the Aho-Corasick word is detected
+                //  2. "set" - specify another "prefix" (string) to be enabled when the Aho-Corasick word is detected.
+                //             if at least one prefix is enabled - it will trigger one or more other regexes.
+                //  3. "and_condition" - specify (comma-separated) sorted list of "prefixes" (in one string).
+                //             all of these prefixes should come together in order to complete a set to match a
+                //             condition and enable one or more other regexes.
+                // Actions of unknown type, or with too few items for their type, are silently skipped.
+                if (actionType == "regex" && action.size() >= 3) {
+                    const std::string &regexPattern = action[1].get<std::string>();
+
+                    // A regex pattern may belong to one word only - a second, different owner is a data error
+                    const auto knownRegex = m_regexToWordMap.find(regexPattern);
+                    if (knownRegex != m_regexToWordMap.end() && knownRegex->second != wordIndex) {
+                        dbgError(D_WAAP_REGEX) << "ERROR: trying to overwrite m_regexToWordMap. pattern='" <<
+                            regexPattern << "'. Old wordIndex='" << knownRegex->second << "' new word='"
+                            << wordStr << "' (wordIndex=" << wordIndex << ")";
+                        error = true;
+                        return;
+                    }
+
+                    const std::string &flags = action[2].get<std::string>();
+
+                    if (flags == "_noregex") {
+                        // Add regex pattern to set of "noRegex" patterns
+                        m_noRegexPatterns.insert(regexPattern);
+                    }
+
+                    m_regexToWordMap[regexPattern] = wordIndex;
+                }
+                else if (actionType == "set" && action.size() >= 2) {
+                    const std::string &setValueStr = action[1].get<std::string>();
+                    const WordIndex setValueIndex = registerWord(setValueStr);
+                    std::vector<WordIndex> &prefixSet = m_wordToPrefixSet[wordIndex];
+
+                    // Keep the list free of duplicates
+                    if (std::find(prefixSet.begin(), prefixSet.end(), setValueIndex) == prefixSet.end()) {
+                        prefixSet.push_back(setValueIndex);
+                    }
+                }
+                else if (actionType == "and_condition" && action.size() >= 2) {
+                    const std::string &groupValueStr = action[1].get<std::string>();
+                    const WordIndex groupValueIndex = registerWord(groupValueStr);
+                    // The number parsed from the start of the group string is used as the count of words needed
+                    // to complete the group.
+                    // NOTE(review): std::stoi throws when the string does not start with a number - presumably
+                    // the data file always provides one; confirm, or decide how such data should be reported.
+                    const size_t expectedCount = static_cast<size_t>(std::stoi(groupValueStr));
+                    const auto value(std::make_pair(groupValueIndex, expectedCount));
+                    std::vector<std::pair<WordIndex, size_t>> &prefixGroup = m_wordToPrefixGroup[wordIndex];
+
+                    // Keep the list free of duplicates
+                    if (std::find(prefixGroup.begin(), prefixGroup.end(), value) == prefixGroup.end()) {
+                        prefixGroup.push_back(value);
+                    }
+                }
+            }
+        }
+
+        dbgTrace(D_WAAP_REGEX) << "Aho-corasick pattern matching engine initialized!";
+    }
+
+    // Tells whether the given regex pattern (string) is present in the set of "noRegex" patterns.
+    bool Waap::RegexPreconditions::isNoRegexPattern(const std::string &pattern) const
+    {
+        const bool isListed = (m_noRegexPatterns.count(pattern) != 0);
+        return isListed;
+    }
+
+    // Returns the string of the word stored under wordIndex. When that word refers to a base word (its
+    // baseWordIndex is not emptyWordIndex), the string of the base word is returned instead.
+    // wordIndex is used to index m_pmWordInfo directly, so it must be an index of a registered word.
+    const std::string &Waap::RegexPreconditions::getWordStrByWordIndex(WordIndex wordIndex) const
+    {
+        const WordIndex baseIdx = m_pmWordInfo[wordIndex].baseWordIndex;
+        const WordIndex effectiveIdx = (baseIdx == emptyWordIndex) ? wordIndex : baseIdx;
+
+        return m_pmWordInfo[effectiveIdx].wordStr;
+    }
+
+    // Looks up the Aho-Corasick word/prefix that the regex pattern (string) is known to be related to.
+    // Returns emptyWordIndex if the pattern is not found, or the index of the Aho-Corasick word/prefix otherwise.
+    // This function is called during each Regex object creation and helps to pre-compute data required for a fast
+    // lookup later during traffic processing.
+    Waap::RegexPreconditions::WordIndex RegexPreconditions::getWordByRegex(const std::string &regexPattern) const
+    {
+        const auto entry = m_regexToWordMap.find(regexPattern);
+        const bool isKnown = (entry != m_regexToWordMap.end());
+
+        return isKnown ? entry->second : emptyWordIndex;
+    }
+
+    // Adds wordIndex to wordsSet, together with every prefix listed for that word in m_wordToPrefixSet.
+    void RegexPreconditions::processWord(RegexPreconditions::PmWordSet &wordsSet, WordIndex wordIndex) const
+    {
+        const auto prefixEntry = m_wordToPrefixSet.find(wordIndex);
+        const bool hasPrefixes = (prefixEntry != m_wordToPrefixSet.end());
+
+        if (hasPrefixes) {
+            // The word is one of the items of an "OR" condition - each OR prefix goes into the wordsSet
+            for (const WordIndex orPrefix : prefixEntry->second) {
+                wordsSet.insert(orPrefix);
+            }
+        }
+
+        // The word itself (as reported by the Aho Corasick scanner)
+        wordsSet.insert(wordIndex);
+    }
+
+    // Tells whether c counts as a "word" character: accepted by Waap::Util::isAlphaAsciiFast (presumably ASCII
+    // letters), a decimal digit, or an underscore.
+    inline bool isRegexWordChar(u_char c) {
+        if (Waap::Util::isAlphaAsciiFast(c)) {
+            return true;
+        }
+
+        return isdigit(c) || c == '_';
+    }
+
+    // First pass: scans the buffer with the pattern matcher (m_pmHook) and, for every match, adds to wordsSet the
+    // matched word, the context-dependent variants of that word which apply at the match position, and the "OR"
+    // prefixes attached to any of them (see processWord).
+    void RegexPreconditions::pass1(RegexPreconditions::PmWordSet &wordsSet, Buffer &&buffer) const
+    {
+        dbgTrace(D_WAAP_REGEX) << "Rules pass #1: collect OR sets";
+
+        // Invoked by the scanner once per match
+        const auto onMatch = [this, &wordsSet, &buffer] (u_int endMatchOffset, const PMPattern &pmPattern)
+        {
+            // The reported offset points to the last character of a match - derive where the match starts
+            const auto matchLen = pmPattern.size();
+            uint matchStart = endMatchOffset + 1 - matchLen;
+
+            // Only the word index is needed from the PMPattern object (not the string part of it)
+            const WordIndex wordIndex = pmPattern.getIndex();
+
+            // Is there a regex word character right before / right after the match?
+            const bool regexWordBefore = (matchStart != 0) &&
+                isRegexWordChar(buffer.data()[matchStart - 1]);
+            const bool regexWordAfter = (matchStart + matchLen < buffer.size()) &&
+                isRegexWordChar(buffer.data()[matchStart + matchLen]);
+
+            processWord(wordsSet, wordIndex);
+
+            // Compute additional constraints ([!\w] before, [!\w] after, [!\w] around the match ...)
+            const auto &wordInfo = m_pmWordInfo[wordIndex];
+            const WordIndex napreWordIndex = wordInfo.napreWordIndex;
+            const WordIndex napostWordIndex = wordInfo.napostWordIndex;
+            const WordIndex napostNapreWordIndex = wordInfo.napostNapreWordIndex;
+
+            const bool boundaryBefore = !regexWordBefore;
+            const bool boundaryAfter = !regexWordAfter;
+
+            // No word character before the match
+            if (boundaryBefore && napreWordIndex != emptyWordIndex) {
+                processWord(wordsSet, napreWordIndex);
+            }
+
+            // No word character after the match
+            if (boundaryAfter && napostWordIndex != emptyWordIndex) {
+                processWord(wordsSet, napostWordIndex);
+            }
+
+            // No word character on either side of the match
+            if (boundaryBefore && boundaryAfter && napostNapreWordIndex != emptyWordIndex) {
+                processWord(wordsSet, napostNapreWordIndex);
+            }
+        };
+
+        m_pmHook.scanBufWithOffsetLambda(buffer, onMatch);
+    }
+
+    // Second pass: for every word already in wordsSet, collects the "AND" groups listed for that word in
+    // m_wordToPrefixGroup. The prefix of a group is added to wordsSet when the number of distinct words collected
+    // for that group reaches the count expected by the group.
+    void RegexPreconditions::pass2(RegexPreconditions::PmWordSet &wordsSet) const
+    {
+        dbgTrace(D_WAAP_REGEX) << "Rules pass #2: collect AND groups";
+
+        // Group prefix -> distinct words of that group seen so far
+        std::unordered_map<WordIndex, std::set<WordIndex>> membersByGroup;
+        // Prefixes of the completed groups; collected aside and added to wordsSet only after the iteration over
+        // wordsSet is finished
+        std::vector<WordIndex> completedGroups;
+
+        for (WordIndex wordIndex : wordsSet) {
+            const auto groupsOfWord = m_wordToPrefixGroup.find(wordIndex);
+
+            if (groupsOfWord == m_wordToPrefixGroup.end()) {
+                // The word is not a part of any "AND" condition
+                continue;
+            }
+
+            for (const auto &prefixCountPair : groupsOfWord->second) {
+                const WordIndex groupPrefix = prefixCountPair.first;
+                const size_t expectedCount = prefixCountPair.second;
+
+                // Creates the (empty) collection on first use of the group
+                std::set<WordIndex> &members = membersByGroup[groupPrefix];
+                members.insert(wordIndex);
+
+                if (members.size() == expectedCount) {
+                    // Full "AND" condition collected - its prefix is due to be added to the wordsSet
+                    completedGroups.push_back(groupPrefix);
+                }
+            }
+        }
+
+        for (const WordIndex groupPrefix : completedGroups) {
+            wordsSet.insert(groupPrefix);
+        }
+    }
+
+    // Scans the buffer with the Aho-Corasick scanner and collects all the words found into wordsSet (the previous
+    // content of wordsSet is discarded). A two pass algorithm is then used to compute the OR and the AND
+    // conditions over the prefixes data. The prefix strings are also added to the wordsSet and are looked up in
+    // the same database.
+    void RegexPreconditions::pmScan(Buffer &&buffer, RegexPreconditions::PmWordSet &wordsSet) const
+    {
+        // Results of an earlier scan must not leak into this one
+        wordsSet.clear();
+
+        // Pass #1 - scanned words and OR prefixes, pass #2 - AND prefixes
+        pass1(wordsSet, std::move(buffer));
+        pass2(wordsSet);
+
+        // Regexes under the empty string key should always be scanned, so the empty word is always reported
+        wordsSet.insert(emptyWordIndex);
+    }
+
+    // Returns the wordIndex already known for wordStr, or allocates a new wordIndex for a word not seen before.
+    // A newly allocated index is the position of the word's WordInfo record inside m_pmWordInfo.
+    Waap::RegexPreconditions::WordIndex RegexPreconditions::registerWord(const std::string &wordStr)
+    {
+        const auto known = m_wordStrToIndex.find(wordStr);
+
+        if (known != m_wordStrToIndex.end()) {
+            return known->second;
+        }
+
+        // Unknown word: the record appended below lands at the current size(), which becomes the word's index
+        const WordIndex newIndex = m_pmWordInfo.size();
+
+        WordInfo newInfo;
+        newInfo.wordStr = wordStr;
+
+        m_wordStrToIndex[wordStr] = newIndex;
+        m_pmWordInfo.push_back(newInfo);
+
+        return newIndex;
+    }
+}