// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved. // Licensed under the Apache License, Version 2.0 (the "License"); // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "Signatures.h" #include "AssertionRegexes.h" #include "agent_core_utilities.h" #include "debug.h" #include "waap.h" #include // for getenv #include // for strcmp #include USE_DEBUG_FLAG(D_WAAP); USE_DEBUG_FLAG(D_WAAP_SAMPLE_SCAN); USE_DEBUG_FLAG(D_WAAP_HYPERSCAN); typedef picojson::value::object JsObj; typedef picojson::value JsVal; typedef picojson::value::array JsArr; typedef std::map> filtered_parameters_t; static std::vector to_strvec(const picojson::value::array &jsV) { std::vector r; for (auto it = jsV.begin(); it != jsV.end(); ++it) { r.push_back(it->get()); } return r; } static std::set to_strset(const picojson::value::array &jsA) { std::set r; for (auto it = jsA.begin(); it != jsA.end(); ++it) { r.insert(it->get()); } return r; } static std::map to_regexmap(const picojson::value::object &jsO, bool &error) { std::map r; for (auto it = jsO.begin(); it != jsO.end(); ++it) { const std::string &n = it->first; // convert name to lowercase now (so we don't need to do it at runtime every time). std::string n_lower; for (std::string::const_iterator pCh = n.begin(); pCh != n.end(); ++pCh) { n_lower += std::tolower(*pCh); } const picojson::value &v = it->second; if (error) { // stop loading regexes if there's previous error... break; } // Pointers to Regex instances are stored instead of instances themselves to avoid // the need to make the Regex objects copyable. // However, these pointers must be freed by the holder of the returned map! // note: in our case this freeing is happening in the destructor of the WaapAssetState class. r[n] = new Regex(v.get(), error, n_lower); } return r; } static filtered_parameters_t to_filtermap(const picojson::value::object &JsObj) { filtered_parameters_t result; for (auto it = JsObj.begin(); it != JsObj.end(); ++it) { const std::string parameter = it->first; const picojson::value::array &arr = it->second.get(); result[parameter] = to_strvec(arr); } return result; } Signatures::Signatures(const std::string& filepath) : sigsSource(loadSource(filepath)), error(false), m_regexPreconditions(std::make_shared(sigsSource, error)), words_regex( to_strvec(sigsSource["words_regex_list"].get()), error, "words_regex_list", m_regexPreconditions ), specific_acuracy_keywords_regex( to_strvec(sigsSource["specific_acuracy_keywords_regex_list"].get()), error, "specific_acuracy_keywords_regex_list", m_regexPreconditions ), pattern_regex( to_strvec(sigsSource["pattern_regex_list"].get()), error, "pattern_regex_list", m_regexPreconditions ), un_escape_pattern(sigsSource["un_escape_pattern"].get(), error, "un_escape_pattern"), quotes_ev_pattern(sigsSource["quotes_ev_pattern"].get(), error, "quotes_ev_pattern"), comment_ev_pattern(sigsSource["comment_ev_pattern"].get(), error, "comment_ev_pattern"), quotes_space_ev_pattern( sigsSource["quotes_space_ev_fast_reg"].get(), error, "quotes_space_ev_fast_reg" ), allowed_text_re(sigsSource["allowed_text_re"].get(), error, "allowed_text_re"), pipe_split_re( "([^|]*)\\||([^|]+)|\\|()", error, "pipe_decode"), semicolon_split_re("([\\w\\=\\-\\_\\.\\,\\(\\)\\%]+?);|([\\w\\=\\-\\_\\.\\,\\(\\)\\%]+)|;()", error, "sem_decode"), longtext_re(sigsSource["longtext_re"].get(), error, "longtext_re"), nospaces_long_value_re("^[^\\s]{16,}$", error, "nospaces_long_value_re"), good_header_name_re(sigsSource["good_header_name_re"].get(), error, "good_header_name"), good_header_value_re(sigsSource["good_header_value_re"].get(), error, "good_header_value"), ignored_for_nospace_long_value( to_strset(sigsSource["ignored_for_nospace_long_value"].get())), global_ignored_keywords( to_strset( sigsSource["global_ignored"].get()["keys"].get() ) ), global_ignored_patterns( to_strset( sigsSource["global_ignored"].get()["patterns"].get() ) ), url_ignored_keywords( to_strset( sigsSource["ignored_for_url"].get()["keys"].get() ) ), url_ignored_patterns( to_strset( sigsSource["ignored_for_url"].get()["patterns"].get() ) ), url_ignored_re( sigsSource["ignored_for_url"].get()["regex"].get(), error, "url_ignored" ), header_ignored_keywords( to_strset( sigsSource["ignored_for_headers"].get()["keys"].get() ) ), header_ignored_patterns( to_strset( sigsSource["ignored_for_headers"].get() ["patterns"].get() ) ), header_ignored_re( sigsSource["ignored_for_headers"].get()["regex"].get(), error, "header_ignored" ), filter_parameters( to_filtermap( sigsSource["filter_parameters"].get() ) ), m_attack_types( to_filtermap( sigsSource["attack_types_map"].get() ) ), // Removed by Pavel's request. Leaving here in case he'll want to add this back... #if 0 cookie_ignored_keywords( to_strset( sigsSource["ignored_for_cookies"].get()["keys"].get() ) ), cookie_ignored_patterns( to_strset( sigsSource["ignored_for_cookies"].get() ["patterns"].get() ) ), cookie_ignored_re( sigsSource["ignored_for_cookies"].get()["regex"].get(), error, "cookie_ignored" ), #endif php_serialize_identifier("^(N;)|^([ibdsOoCcRra]:\\d+)", error, "php_serialize_identifier"), html_regex("(<(?>body|head)\\b.*>(?>.|[\\r\\n]){0,400}){2}|.+\\|)+.+}"), headers_re(to_regexmap(sigsSource["headers_re"].get(), error)), format_magic_binary_re(sigsSource["format_magic_binary_re"].get(), error, "format_magic_binary_re"), params_type_re(to_regexmap(sigsSource["format_types_regex_list"].get(), error)), resp_hdr_pattern_regex_list(to_strvec(sigsSource["resp_hdr_pattern_regex_list"].get()), error, "resp_hdr_pattern_regex_list", nullptr), resp_hdr_words_regex_list(to_strvec(sigsSource["resp_hdr_words_regex_list"].get()), error, "resp_hdr_words_regex_list", nullptr), resp_body_pattern_regex_list(to_strvec(sigsSource["resp_body_pattern_regex_list"].get()), error, "resp_body_pattern_regex_list", nullptr), resp_body_words_regex_list(to_strvec(sigsSource["resp_body_words_regex_list"].get()), error, "resp_body_words_regex_list", nullptr), remove_keywords_always( to_strset(sigsSource["remove_keywords_always"].get())), user_agent_prefix_re(sigsSource["user_agent_prefix_re"].get()), binary_data_kw_filter(sigsSource["binary_data_kw_filter"].get()), wbxml_data_kw_filter(sigsSource["wbxml_data_kw_filter"].get()), m_hyperscanInitialized(false) { // Only preprocess hyperscan patterns if hyperscan is enabled bool should_use_hyperscan = Signatures::shouldUseHyperscan(); if (should_use_hyperscan) { preprocessHyperscanPatterns(); } } Signatures::~Signatures() { } bool Signatures::fail() { return error; } // Static helper to process assertion flags for a pattern (for testing and internal use) std::string Signatures::processAssertions(const std::string &groupName, const std::string &pattern, AssertionFlags &flags) { std::string processed = pattern; // Use regexes from AssertionRegexes namespace to detect assertions at start/end of the pattern string using namespace Waap::AssertionRegexes; boost::smatch match; // Start assertions - only a single '(' can precede if (boost::regex_search(processed, match, reStartNonWordBehind) && match.position() >= 0) { flags.setFlag(AssertionFlag::START_NON_WORD_BEHIND); processed = boost::regex_replace(processed, reStartNonWordBehind, std::string("")); } // Path traversal start assertion if (boost::regex_search(processed, match, rePathTraversalStart) && match.position() >= 0) { flags.setFlag(AssertionFlag::PATH_TRAVERSAL_START); processed = boost::regex_replace(processed, rePathTraversalStart, std::string("")); } // End assertions - only a single ')' can follow if (boost::regex_search(processed, match, reEndNonWordAhead) && match.position() >= 0) { flags.setFlag(AssertionFlag::END_NON_WORD_AHEAD); processed = boost::regex_replace(processed, reEndNonWordAhead, std::string("")); } else if (boost::regex_search(processed, match, reEndNonWordSpecial) && match.position() >= 0) { flags.setFlag(AssertionFlag::END_NON_WORD_SPECIAL); processed = boost::regex_replace(processed, reEndNonWordSpecial, std::string("")); } // Path traversal end assertion if (boost::regex_search(processed, match, rePathTraversalEnd) && match.position() >= 0) { flags.setFlag(AssertionFlag::PATH_TRAVERSAL_END); processed = boost::regex_replace(processed, rePathTraversalEnd, std::string("")); } // wildcard evasion regex group name starts with evasion_wildcard_regex if (groupName.find("evasion_wildcard_regex") == 0) { flags.setFlag(AssertionFlag::WILDCARD_EVASION); } return processed; } // Extracts the group name from a regex pattern string (e.g., (?P...)) std::string Signatures::extractGroupName(const std::string &pattern) { boost::regex namedGroupRegex(R"(\(\?P<([^>]+)>)"); boost::smatch match; if (boost::regex_search(pattern, match, namedGroupRegex)) { return match[1].str(); } return ""; } void Signatures::preprocessHyperscanPatterns() { std::map categoryCount; // Helper function to check if a pattern is hyperscan compatible auto isHyperscanCompatible = [&categoryCount](const std::string &pattern) -> bool { // Hyperscan doesn't support certain regex features that we can't easily convert static const std::vector incompatibleFeatures = { R"((?!\w)", R"((?)", R"((?&)", R"((?|)", R"((?P<)", // Atomic groups, named groups, and branching R"((?R)" // Recursion }; for (const auto &feature : incompatibleFeatures) { if (pattern.find(feature) != std::string::npos) { dbgInfo(D_WAAP_HYPERSCAN) << "Incompatible feature found: " << feature << " in pattern: " << pattern; categoryCount[feature]++; return false; } } boost::regex backrefRegex(R"(\(\\\d+\))"); if (boost::regex_search(pattern, backrefRegex)) { dbgInfo(D_WAAP_HYPERSCAN) << "Incompatible backreference found: " << pattern; categoryCount["backreference"]++; return false; } return true; }; // Helper function to convert regex pattern to hyperscan-compatible format auto convertToHyperscanPattern = [](const std::string &originalPattern) -> std::string { std::string converted = originalPattern; // Remove named group syntax - convert (?P...) to ... boost::regex namedGroupRegex(R"(\(\?P<[^>]+>)"); if (boost::regex_search(converted, namedGroupRegex)) { std::string end_str = ")"; if (converted.back() == ')') { converted.pop_back(); // Remove the trailing ')' end_str = ""; } converted = boost::regex_replace(converted, namedGroupRegex, end_str); } // Handle atomic groups first (before removing word boundaries) // Hyperscan doesn't support atomic groups, so we need to convert them // Convert (?>\b) to nothing (remove word boundary atomic groups) converted = boost::regex_replace(converted, boost::regex(R"(\(\?\>\\b\))"), std::string("")); // Convert (?>\B) to nothing (remove non-word boundary atomic groups) converted = boost::regex_replace(converted, boost::regex(R"(\(\?\>\\B\))"), std::string("")); // Convert empty atomic groups (?>) to nothing converted = boost::regex_replace(converted, boost::regex(R"(\(\?\>\))"), std::string("")); // // Now remove remaining word boundaries (not supported by Hyperscan) // // At this point, any \b or \B that was inside atomic groups has been handled above // converted = boost::regex_replace(converted, boost::regex(R"(\\b)"), std::string("")); // converted = boost::regex_replace(converted, boost::regex(R"(\\B)"), std::string("")); return converted; }; // Helper function to get patterns from sigsSource for each category auto getCommonPatternsForCategory = [this](const std::string &category, const std::string ®exSource) -> std::vector { std::vector patterns; // Map regexSource/category to the JSON key in sigsSource std::string key; if (regexSource == "specific_acuracy_keywords_regex" || category == "specific_accuracy") { key = "specific_acuracy_keywords_regex_list"; } else if (regexSource == "words_regex" || category == "keywords") { key = "words_regex_list"; } else if (regexSource == "pattern_regex" || category == "patterns") { key = "pattern_regex_list"; } else { // Fallback: allow passing the exact key name key = regexSource; dbgDebug(D_WAAP_HYPERSCAN) << "Unknown category/regexSource: " << category << "/" << regexSource << ". Using regexSource as key."; } // Fetch patterns directly from sigsSource if available auto it = sigsSource.find(key); if (it != sigsSource.end()) { try { patterns = to_strvec(it->second.get()); } catch (...) { // If the type is unexpected, return empty and continue gracefully patterns.clear(); dbgWarning(D_WAAP_HYPERSCAN) << "Unexpected type for key: " << key; } } return patterns; }; // Process specific_acuracy_keywords_regex patterns std::vector incompatiblePatterns; { auto patterns = getCommonPatternsForCategory("specific_accuracy", "specific_acuracy_keywords_regex"); for (const auto &pattern : patterns) { AssertionFlags flags; std::string groupName = extractGroupName(pattern); std::string processedPattern = convertToHyperscanPattern(pattern); std::string hyperscanPattern = processAssertions(groupName, processedPattern, flags); if (hyperscanPattern != pattern) { dbgTrace(D_WAAP_HYPERSCAN) << pattern << " -> " << hyperscanPattern; } if (isHyperscanCompatible(hyperscanPattern)) { HyperscanPattern hsPattern; hsPattern.originalPattern = pattern; hsPattern.hyperscanPattern = hyperscanPattern; hsPattern.category = "specific_accuracy"; hsPattern.regexSource = "specific_acuracy_keywords_regex"; hsPattern.groupName = groupName; if (hsPattern.groupName.empty()) { hsPattern.groupName = "specific_accuracy_match"; } hsPattern.isFastReg = (hsPattern.groupName.find("fast_reg") != std::string::npos); hsPattern.isEvasion = (hsPattern.groupName.find("evasion") != std::string::npos); m_keywordHyperscanPatterns.push_back(hsPattern); m_keywordAssertionFlags.push_back(flags); } else { incompatiblePatterns.push_back(pattern); } } } // Process words_regex patterns { auto patterns = getCommonPatternsForCategory("keywords", "words_regex"); for (const auto &pattern : patterns) { AssertionFlags flags; std::string groupName = extractGroupName(pattern); std::string processedPattern = convertToHyperscanPattern(pattern); std::string hyperscanPattern = processAssertions(groupName, processedPattern, flags); if (hyperscanPattern != pattern) { dbgTrace(D_WAAP_HYPERSCAN) << pattern << " -> " << hyperscanPattern; } if (isHyperscanCompatible(hyperscanPattern)) { HyperscanPattern hsPattern; hsPattern.originalPattern = pattern; hsPattern.hyperscanPattern = hyperscanPattern; hsPattern.category = "keywords"; hsPattern.regexSource = "words_regex"; hsPattern.groupName = groupName; if (hsPattern.groupName.empty()) { hsPattern.groupName = "keywords_match"; } hsPattern.isFastReg = (hsPattern.groupName.find("fast_reg") != std::string::npos); hsPattern.isEvasion = (hsPattern.groupName.find("evasion") != std::string::npos); m_keywordHyperscanPatterns.push_back(hsPattern); m_keywordAssertionFlags.push_back(flags); } else { incompatiblePatterns.push_back(pattern); } } } // Process pattern_regex patterns { auto patterns = getCommonPatternsForCategory("patterns", "pattern_regex"); for (const auto &pattern : patterns) { AssertionFlags flags; std::string groupName = extractGroupName(pattern); std::string processedPattern = convertToHyperscanPattern(pattern); std::string hyperscanPattern = processAssertions(groupName, processedPattern, flags); if (hyperscanPattern != pattern) { dbgTrace(D_WAAP_HYPERSCAN) << pattern << " -> " << hyperscanPattern; } if (isHyperscanCompatible(hyperscanPattern)) { HyperscanPattern hsPattern; hsPattern.originalPattern = pattern; hsPattern.hyperscanPattern = hyperscanPattern; hsPattern.category = "patterns"; hsPattern.regexSource = "pattern_regex"; hsPattern.groupName = groupName; if (hsPattern.groupName.empty()) { hsPattern.groupName = "patterns_match"; } hsPattern.isFastReg = (hsPattern.groupName.find("fast_reg") != std::string::npos); hsPattern.isEvasion = (hsPattern.groupName.find("evasion") != std::string::npos); m_patternHyperscanPatterns.push_back(hsPattern); m_patternAssertionFlags.push_back(flags); } else { incompatiblePatterns.push_back(pattern); } } } dbgInfo(D_WAAP_HYPERSCAN) << "Preprocessed Hyperscan patterns: " << "keywords=" << m_keywordHyperscanPatterns.size() << ", patterns=" << m_patternHyperscanPatterns.size() << ", incompatible=" << incompatiblePatterns.size(); for (const auto &it : categoryCount) { dbgInfo(D_WAAP_HYPERSCAN) << "Feature: " << it.first << ", Count: " << it.second; } // Convert incompatible patterns to PmWordSet for traditional regex processing if (m_regexPreconditions && !incompatiblePatterns.empty()) { for (const auto &pattern : incompatiblePatterns) { Waap::RegexPreconditions::WordIndex wordIndex = m_regexPreconditions->getWordByRegex(pattern); if (wordIndex != Waap::RegexPreconditions::emptyWordIndex) { m_incompatiblePatternsPmWordSet.insert(wordIndex); } } dbgInfo(D_WAAP_HYPERSCAN) << "Created PmWordSet for " << m_incompatiblePatternsPmWordSet.size() << " incompatible patterns (from " << incompatiblePatterns.size() << " total)"; } } picojson::value::object Signatures::loadSource(const std::string &waapDataFileName) { picojson::value doc; std::ifstream f(waapDataFileName); if (f.fail()) { dbgError(D_WAAP) << "Failed to open json data file '" << waapDataFileName << "'!"; error = true; // flag an error return picojson::value::object(); } int length; f.seekg(0, std::ios::end); // go to the end length = f.tellg(); // report location (this is the length) char *buffer = new char[length]; // allocate memory for a buffer of appropriate dimension f.seekg(0, std::ios::beg); // go back to the beginning f.read(buffer, length); // read the whole file into the buffer f.close(); std::string dataObfuscated(buffer, length); delete[] buffer; std::stringstream ss(dataObfuscated); ss >> doc; if (!picojson::get_last_error().empty()) { dbgError(D_WAAP) << "WaapAssetState::loadSource('" << waapDataFileName << "') failed (parse error: '" << picojson::get_last_error() << "')."; error = true; // flag an error return picojson::value::object(); } return doc.get()["waap_signatures"].get(); } const std::vector &Signatures::getKeywordHyperscanPatterns() const { return m_keywordHyperscanPatterns; } const std::vector &Signatures::getPatternHyperscanPatterns() const { return m_patternHyperscanPatterns; } const std::vector &Signatures::getKeywordAssertionFlags() const { return m_keywordAssertionFlags; } const std::vector &Signatures::getPatternAssertionFlags() const { return m_patternAssertionFlags; } const Waap::RegexPreconditions::PmWordSet &Signatures::getIncompatiblePatternsPmWordSet() const { return m_incompatiblePatternsPmWordSet; } void Signatures::processRegexMatch(const std::string &groupName, const std::string &groupValue, std::string &word, std::vector &keyword_matches, Waap::Util::map_of_stringlists_t &found_patterns, bool longTextFound, bool binaryDataFound) const { std::string group = groupName; if (group == "") { return; // skip unnamed group } const std::string &value = groupValue; dbgTrace(D_WAAP_SAMPLE_SCAN) << "checkRegex: group name='" << group << "' value='" << value << "', word='" << word << "':"; if (group.find("fast_reg") != std::string::npos) { dbgTrace(D_WAAP_SAMPLE_SCAN) << "checkRegex: found '*fast_reg*' in group name"; if (group.find("evasion") != std::string::npos) { dbgTrace(D_WAAP_SAMPLE_SCAN) << "checkRegex: found both 'fast_reg' and 'evasion' in group name."; word = "encoded_" + repr_uniq(value); if (word == "encoded_") { dbgTrace(D_WAAP_SAMPLE_SCAN) << "checkRegex: empty word after repr_uniq: resetting word to 'character_encoding'" " and group to 'evasion'."; word = "character_encoding"; } else if (Waap::Util::str_isalnum(word)) { dbgTrace(D_WAAP_SAMPLE_SCAN) << "checkRegex: isalnum word after repr_uniq: resetting group to 'evasion'."; // If the found match is alphanumeric (we've seen strings like "640x480" match) // we still should assume evasion but it doesn't need to include "fast_reg", // which would cause unconditional report to stage2 and hit performance... // This is why we remove the word "fast_reg" from the group name. group = "evasion"; } if (longTextFound) { dbgTrace(D_WAAP_SAMPLE_SCAN) << "checkRegex: longTextFound so resetting group name to 'longtext'"; group = "longtext"; } } else { word = group; } } // In sequences detected as "longTextFound" or "longBinaryFound", do not add words in the // "keyword_matches" list that: // - starts with "encoded_" // - or startswith("\") // - or equal to "character_encoding" if ((longTextFound || binaryDataFound) && (word == "character_encoding" || word.substr(0, 1) == "\\" || word.substr(0, 8) == "encoded_")) { // For now, do not skip // TODO - check if skipping improves detection dbgTrace(D_WAAP_SAMPLE_SCAN) << "longText/binaryData found with character_encoding"; } else if (binaryDataFound && (isShortWord(word) || isShortHtmlTag(word) || NGEN::Regex::regexMatch(__FILE__, __LINE__, group, binary_data_kw_filter))) { dbgTrace(D_WAAP_SAMPLE_SCAN) << "Not adding group='" << group << "', word='" << word << "' - due to binary data"; return; } else if ((std::find(keyword_matches.begin(), keyword_matches.end(), word) == keyword_matches.end())) { // python: if (word not in current_matches): current_matches.append(word) keyword_matches.push_back(word); dbgTrace(D_WAAP_SAMPLE_SCAN) << "added keyword match for group='" << group << "', value='" << value << "', word='" << word << "'"; } // python: // if group not in found_patterns: // found_patterns[group]=[] if (found_patterns.find(group) == found_patterns.end()) { found_patterns[group] = std::vector(); } // python: // if value not in found_patterns[group]: // found_patterns[group].append(value) if (std::find(found_patterns[group].begin(), found_patterns[group].end(), value) == found_patterns[group].end()) { found_patterns[group].push_back(value); dbgTrace(D_WAAP_SAMPLE_SCAN) << "added pattern match for group='" << group << "', value='" << value << "', word='" << word << "'"; } } bool Signatures::isHyperscanInitialized() const { return m_hyperscanInitialized; } void Signatures::setHyperscanInitialized(bool initialized) { m_hyperscanInitialized = initialized; } bool Signatures::shouldUseHyperscan(bool force) { // This can be controlled by environment variable or configuration static bool useHyperscan = false; #ifdef USE_HYPERSCAN static bool checked = false; if (!checked || force) { // Check environment variable first const char *env = getenv("WAAP_USE_HYPERSCAN"); if (env) { useHyperscan = (strcmp(env, "1") == 0 || strcasecmp(env, "true") == 0); dbgDebug(D_WAAP_SAMPLE_SCAN) << "Hyperscan usage set by environment: " << useHyperscan; } else { // Default to false to maintain backward compatibility - Hyperscan is opt-in useHyperscan = false; dbgDebug(D_WAAP_SAMPLE_SCAN) << "Hyperscan usage default (disabled): " << useHyperscan; } checked = true; } #endif // USE_HYPERSCAN return useHyperscan; }