update code to support brotli

Ned Wright
2026-01-04 11:39:41 +00:00
parent 2105628f05
commit 041a463390
22 changed files with 3292 additions and 81 deletions


@@ -0,0 +1,127 @@
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <streambuf>
#include <vector>
#include <memory>
#include "compression_utils.h"
#include "debug.h"
USE_DEBUG_FLAG(D_WAAP_SERIALIZE);
// Forward declarations
class WaapComponent;
void yieldIfPossible(const std::string &func, int line);
#define YIELD_IF_POSSIBLE() yieldIfPossible(__FUNCTION__, __LINE__)
//
// Buffered output stream that compresses and encrypts data when flushing
//
// Usage example:
// std::stringstream ss;
// BufferedCompressedOutputStream compressed_stream(ss);
// compressed_stream << "Hello, World!";
// compressed_stream.flush(); // Data is compressed, encrypted, and written to ss
class BufferedCompressedOutputStream : public std::ostream
{
public:
explicit BufferedCompressedOutputStream(std::ostream &underlying_stream);
~BufferedCompressedOutputStream();
// Manual flush to compress, encrypt and write data
void flush();
void close();
private:
class CompressedBuffer : public std::streambuf, Singleton::Consume<I_Encryptor>
{
public:
explicit CompressedBuffer(std::ostream &underlying_stream);
~CompressedBuffer();
// Public method to flush the buffer
void flushAndClose();
void flushBuffer();
protected:
virtual int overflow(int c) override;
virtual std::streamsize xsputn(const char* s, std::streamsize n) override;
virtual int sync() override;
private:
// Compress and encrypt buffer; is_last indicates final chunk
bool compressAndEncryptBuffer(bool is_last);
std::ostream &m_underlying_stream;
std::vector<char> m_buffer;
static const size_t BUFFER_SIZE = 16 * 1024; // 16KiB
CompressionStream* m_compression_stream;
bool m_closed;
};
std::unique_ptr<CompressedBuffer> m_buffer;
};
// Buffered input stream that decrypts and decompresses data when reading
//
// Usage example:
// std::stringstream ss("encrypted compressed data");
// BufferedCompressedInputStream decompressed_stream(ss);
// std::string line;
// std::getline(decompressed_stream, line); // Data is decrypted and decompressed
class BufferedCompressedInputStream : public std::istream
{
public:
explicit BufferedCompressedInputStream(std::istream &underlying_stream);
~BufferedCompressedInputStream();
private:
class DecompressedBuffer : public std::streambuf
{
public:
explicit DecompressedBuffer(std::istream &underlying_stream);
~DecompressedBuffer();
protected:
virtual int underflow() override;
virtual std::streamsize xsgetn(char* s, std::streamsize n) override;
private:
bool fillBuffer();
bool processNextChunk();
bool decryptChunk(const std::vector<char> &encrypted_chunk, std::vector<char> &decrypted_chunk);
bool decompressChunk(const std::vector<char> &compressed_chunk, std::vector<char> &decompressed_chunk);
std::istream &m_underlying_stream;
std::vector<char> m_buffer; // Output buffer for decompressed data
std::vector<char> m_encrypted_buffer; // Buffer for encrypted data from stream
std::vector<char> m_compressed_buffer; // Buffer for decrypted but still compressed data
std::vector<char> m_decompressed_buffer; // Buffer for decompressed data chunks
size_t m_decompressed_pos; // Current position in decompressed buffer
static const size_t OUTPUT_BUFFER_SIZE = 64 * 1024; // 64KiB output buffer
static const size_t CHUNK_SIZE = 16 * 1024; // 16KiB chunks for processing
CompressionStream* m_compression_stream;
bool m_eof_reached;
bool m_stream_finished; // Whether we've finished processing the entire stream
};
std::unique_ptr<DecompressedBuffer> m_buffer;
};
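
For orientation, here is a minimal round-trip sketch (not part of the commit) combining the two classes declared above. It assumes the header name used by the implementation file, buffered_compressed_stream.h, and an environment where initCompressionStream() is available.

// Sketch only: write through the compressing stream, then read the same bytes
// back through the decompressing one. `storage` stands in for a file on disk.
#include <iterator>
#include <sstream>
#include <string>
#include "buffered_compressed_stream.h"

std::string roundTrip(const std::string &payload)
{
    std::stringstream storage;
    {
        BufferedCompressedOutputStream out(storage);  // compresses on flush()/close()
        out << payload;
        out.close();                                  // finalizes the compression stream
    }
    BufferedCompressedInputStream in(storage);        // decompresses on read
    return std::string(std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>());
}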


@@ -0,0 +1,35 @@
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __ASSERTION_REGEXES_H__
#define __ASSERTION_REGEXES_H__
#include <boost/regex.hpp>
namespace Waap {
namespace AssertionRegexes {
// Static const boost regexes used in processAssertions() function
// These regexes detect various assertion patterns in regex strings
// The patterns are in a separate file to avoid this codestyle checker issue:
// "error T009: comma should be followed by whitespace"
static const boost::regex reStartNonWordBehind(R"(\(\?<!\\w\))"); // (?<!\w)
static const boost::regex reEndNonWordAhead(R"(\(\?!\\w\))"); // (?!\w)
static const boost::regex reEndNonWordSpecial(R"(\(\?=\[\^\\w\?<>:=\]\|\$\))"); // (?=[^\w?<>:=]|$)
static const boost::regex rePathTraversalStart(R"(\(\?<!\[\\\.\,:\]\))"); // (?<![\.,:])
static const boost::regex rePathTraversalEnd(R"(\(\?!\[\\\.\,:\]\))"); // (?![\.,:])
} // namespace AssertionRegexes
} // namespace Waap
#endif // __ASSERTION_REGEXES_H__
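
A hedged sketch of how these regexes might be consumed by a processAssertions-style pass (that function is referenced above but not shown here); the helper name, the assumed header name, and the flag handling are illustrative only.

#include <string>
#include <boost/regex.hpp>
// Assumed header name for the file above (only its include guard is shown).
#include "assertion_regexes.h"

// Hypothetical helper: detect and strip a (?<!\w) assertion from a signature
// pattern so the remainder can be fed to an engine without look-behind support;
// the caller would record the assertion and re-validate it after a raw match.
static bool stripStartNonWordBehind(std::string &pattern)
{
    if (!boost::regex_search(pattern, Waap::AssertionRegexes::reStartNonWordBehind)) {
        return false;
    }
    pattern = boost::regex_replace(pattern, Waap::AssertionRegexes::reStartNonWordBehind, "");
    return true;
}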


@@ -0,0 +1,339 @@
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "UnifiedIndicatorsContainer.h"
#include <cereal/archives/json.hpp>
#include <cereal/types/string.hpp>
#include <cereal/types/vector.hpp>
#include <algorithm>
using std::string;
using std::unordered_map;
using std::unordered_set;
using std::ostream;
using std::istream;
// -------------------------------
// Interning helpers
// -------------------------------
const std::string*
UnifiedIndicatorsContainer::internValue(const std::string &value)
{
auto it = valuePool.find(value);
if (it == valuePool.end()) it = valuePool.insert(value).first;
return &(*it);
}
const std::string*
UnifiedIndicatorsContainer::internSource(const std::string &source)
{
auto it = sourcesPool.find(source);
if (it == sourcesPool.end()) it = sourcesPool.insert(source).first;
return &(*it);
}
// -------------------------------
// Public API
// -------------------------------
void
UnifiedIndicatorsContainer::addIndicator(
const std::string &key,
const std::string &value,
IndicatorType type,
const std::string &source)
{
auto &filters = filtersDataPerKey[key];
const std::string *valPtr = internValue(value);
const std::string *srcPtr = internSource(source);
FilterData &bucket = (type == IndicatorType::KEYWORD)
? filters.getIndicators()
: filters.getTypes();
auto &srcSet = bucket[const_cast<std::string*>(valPtr)];
srcSet.insert(const_cast<std::string*>(srcPtr));
// Update per-key total sources union
filters.getTotalSources().insert(const_cast<std::string*>(srcPtr));
}
void UnifiedIndicatorsContainer::addEntry(const Entry &entry)
{
const std::string *srcPtr = internSource(entry.sourceId);
if (entry.isTrusted && srcPtr) {
trustedSources.insert(srcPtr);
}
for (const auto &val : entry.indicators) {
addIndicator(entry.key, val, IndicatorType::KEYWORD, entry.sourceId);
}
for (const auto &val : entry.types) {
addIndicator(entry.key, val, IndicatorType::TYPE, entry.sourceId);
}
}
bool
UnifiedIndicatorsContainer::hasIndicator(
const std::string &key,
const std::string &value,
IndicatorType type) const
{
auto keyIt = filtersDataPerKey.find(key);
if (keyIt == filtersDataPerKey.end()) return false;
const Filters &filters = keyIt->second;
const FilterData &bucket = (type == IndicatorType::KEYWORD)
? filters.getIndicators()
: filters.getTypes();
auto valIt = valuePool.find(value);
if (valIt == valuePool.end()) return false;
auto it = bucket.find(const_cast<std::string*>(&(*valIt)));
return it != bucket.end();
}
std::unordered_set<std::string>
UnifiedIndicatorsContainer::getSources(
const std::string &key,
const std::string &value,
IndicatorType type) const
{
std::unordered_set<std::string> out;
auto keyIt = filtersDataPerKey.find(key);
if (keyIt == filtersDataPerKey.end()) return out;
const Filters &filters = keyIt->second;
const FilterData &bucket = (type == IndicatorType::KEYWORD)
? filters.getIndicators()
: filters.getTypes();
auto valIt = valuePool.find(value);
if (valIt == valuePool.end()) return out;
auto it = bucket.find(const_cast<std::string*>(&(*valIt)));
if (it == bucket.end()) return out;
for (auto p : it->second) if (p) out.insert(*p);
return out;
}
size_t
UnifiedIndicatorsContainer::getIndicatorCount() const
{
size_t count = 0;
for (const auto &k : filtersDataPerKey) {
count += k.second.getIndicators().size();
count += k.second.getTypes().size();
}
return count;
}
size_t
UnifiedIndicatorsContainer::getKeyCount() const
{
return filtersDataPerKey.size();
}
size_t
UnifiedIndicatorsContainer::getValuePoolSize() const
{
return valuePool.size();
}
void
UnifiedIndicatorsContainer::clear()
{
filtersDataPerKey.clear();
valuePool.clear();
sourcesPool.clear();
trustedSources.clear();
}
// -------------------------------
// Serialization
// -------------------------------
void
UnifiedIndicatorsContainer::serialize(std::ostream &stream) const
{
cereal::JSONOutputArchive ar(stream);
// Write trustedSources as a named array under the root object (global trusted only)
ar.setNextName("trustedSources");
ar.startNode();
cereal::size_type n_trusted = static_cast<cereal::size_type>(trustedSources.size());
ar(cereal::make_size_tag(n_trusted));
for (auto p : trustedSources) ar(p ? *p : std::string());
ar.finishNode();
// logger: object of keys -> { totalSources: [...], indicators: {...}, types: {...} }
ar.setNextName("logger");
ar.startNode();
for (const auto &k : filtersDataPerKey) {
ar.setNextName(k.first.c_str());
ar.startNode();
// totalSources section (union per key)
ar.setNextName("totalSources");
ar.startNode();
const auto &ts = k.second.getTotalSources();
cereal::size_type ts_sz = static_cast<cereal::size_type>(ts.size());
ar(cereal::make_size_tag(ts_sz));
for (auto p : ts) ar(p ? *p : std::string());
ar.finishNode();
// indicators section
ar.setNextName("indicators");
ar.startNode();
for (const auto &kv : k.second.getIndicators()) {
const std::string *val = kv.first;
ar.setNextName(val ? val->c_str() : "");
ar.startNode();
cereal::size_type sz = static_cast<cereal::size_type>(kv.second.size());
ar(cereal::make_size_tag(sz));
for (auto p : kv.second) ar(p ? *p : std::string());
ar.finishNode(); // end value array
}
ar.finishNode(); // end indicators
// types section
ar.setNextName("types");
ar.startNode();
for (const auto &kv : k.second.getTypes()) {
const std::string *val = kv.first;
ar.setNextName(val ? val->c_str() : "");
ar.startNode();
cereal::size_type sz = static_cast<cereal::size_type>(kv.second.size());
ar(cereal::make_size_tag(sz));
for (auto p : kv.second) ar(p ? *p : std::string());
ar.finishNode(); // end value array
}
ar.finishNode(); // end types
ar.finishNode(); // end key object
}
ar.finishNode(); // end logger
}
void
UnifiedIndicatorsContainer::deserialize(std::istream &stream)
{
cereal::JSONInputArchive ar(stream);
clear();
// trustedSources (optional) as a named array
try {
ar.setNextName("trustedSources");
ar.startNode();
cereal::size_type n = 0;
ar(cereal::make_size_tag(n));
for (cereal::size_type i = 0; i < n; ++i) {
std::string s; ar(s);
const std::string *p = internSource(s);
trustedSources.insert(p);
}
ar.finishNode();
} catch (...) {
// Field may be absent
}
// logger
try {
ar.setNextName("logger");
ar.startNode();
while (true) {
const auto node_name = ar.getNodeName();
if (!node_name) break;
std::string key = node_name;
ar.startNode(); // enter key object
// totalSources (optional)
try {
ar.setNextName("totalSources");
ar.startNode();
cereal::size_type ts_sz = 0;
ar(cereal::make_size_tag(ts_sz));
auto &ts = filtersDataPerKey[key].getTotalSources();
for (cereal::size_type i = 0; i < ts_sz; ++i) {
std::string s; ar(s);
const std::string *p = internSource(s);
ts.insert(const_cast<std::string*>(p));
}
ar.finishNode();
} catch (...) {
// no totalSources
}
// indicators
try {
ar.setNextName("indicators");
ar.startNode();
while (true) {
const auto val_name = ar.getNodeName();
if (!val_name) break;
std::string value = val_name;
ar.startNode();
cereal::size_type sz = 0;
ar(cereal::make_size_tag(sz));
for (cereal::size_type i = 0; i < sz; ++i) {
std::string src; ar(src);
addIndicator(key, value, IndicatorType::KEYWORD, src);
}
ar.finishNode(); // end value array
}
ar.finishNode();
} catch (...) {
// no indicators
}
// types
try {
ar.setNextName("types");
ar.startNode();
while (true) {
const auto val_name = ar.getNodeName();
if (!val_name) break;
std::string value = val_name;
ar.startNode();
cereal::size_type sz = 0;
ar(cereal::make_size_tag(sz));
for (cereal::size_type i = 0; i < sz; ++i) {
std::string src; ar(src);
addIndicator(key, value, IndicatorType::TYPE, src);
}
ar.finishNode(); // end value array
}
ar.finishNode();
} catch (...) {
// no types
}
ar.finishNode(); // finish key object
}
ar.finishNode(); // finish logger
} catch (...) {
// Field may be absent
}
}
bool UnifiedIndicatorsContainer::isTrustedSource(const std::string &source) const {
// Linear scan over the trusted set: trustedSources holds pointers into sourcesPool
// (the interned std::string storage), so dereference each pointer and compare by value.
for (const auto &p : trustedSources) {
if (p && *p == source) return true;
}
return false;
}
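
A short usage sketch of the public API implemented above; the key, value, and source strings are made up for illustration.

#include <sstream>
#include "UnifiedIndicatorsContainer.h"

// Illustrative only: exercise addEntry / hasIndicator / getSources / serialize.
void unifiedIndicatorsExample()
{
    UnifiedIndicatorsContainer container;

    UnifiedIndicatorsContainer::Entry entry;
    entry.key = "userName";                  // hypothetical parameter key
    entry.sourceId = "10.0.0.1";             // hypothetical source id
    entry.isTrusted = true;
    entry.indicators = {"select", "union"};  // stored as IndicatorType::KEYWORD
    entry.types = {"sql"};                   // stored as IndicatorType::TYPE
    container.addEntry(entry);

    bool found = container.hasIndicator("userName", "select", IndicatorType::KEYWORD);
    auto sources = container.getSources("userName", "select", IndicatorType::KEYWORD);
    bool trusted = container.isTrustedSource("10.0.0.1");

    std::stringstream buf;
    container.serialize(buf);                // JSON: trustedSources array + per-key "logger" object
    (void)found; (void)sources; (void)trusted;
}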


@@ -0,0 +1,220 @@
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <vector>
#include <iostream>
#include <cereal/cereal.hpp>
#include <cereal/archives/json.hpp>
#include "i_serialize.h"
// #include "custom_serialization.h"
// Indicator type enumeration for type safety and compactness
enum class IndicatorType : uint8_t {
KEYWORD = 0,
TYPE = 1
};
typedef std::unordered_set<std::string*> SourcesSet;
typedef std::unordered_map<std::string*, SourcesSet> FilterData;
// Proposed name for `Filters`: KeyLog (represents the per-key section under "logger")
// Keeping class name as Filters to minimize changes; can be renamed in a follow-up.
class Filters {
public:
Filters() = default;
~Filters() = default;
// Const overload for cereal serialization
template<class Archive>
void serialize(Archive& ar) const {
std::vector<std::string> totalSourcesVec;
std::unordered_map<std::string, std::vector<std::string>> indicatorsMap, typesMap;
for (auto p : totalSources) {
if (p) totalSourcesVec.push_back(*p);
}
for (const auto& kv : indicators) {
std::string key = kv.first ? *kv.first : std::string();
std::vector<std::string> sources;
for (auto p : kv.second) {
if (p) sources.push_back(*p);
}
indicatorsMap[key] = sources;
}
for (const auto& kv : types) {
std::string key = kv.first ? *kv.first : std::string();
std::vector<std::string> sources;
for (auto p : kv.second) {
if (p) sources.push_back(*p);
}
typesMap[key] = sources;
}
ar(
cereal::make_nvp("totalSources", totalSourcesVec),
cereal::make_nvp("indicators", indicatorsMap),
cereal::make_nvp("types", typesMap)
);
}
// Accessors for container implementation
FilterData & getIndicators() { return indicators; }
FilterData & getTypes() { return types; }
const FilterData & getIndicators() const { return indicators; }
const FilterData & getTypes() const { return types; }
// Per-key total sources (union of sources from indicators and types)
SourcesSet & getTotalSources() { return totalSources; }
const SourcesSet & getTotalSources() const { return totalSources; }
private:
FilterData indicators;
FilterData types;
SourcesSet totalSources;
};
// Unified indicators container with string interning and memory optimization
class UnifiedIndicatorsContainer {
public:
// Batch entry input
struct Entry {
std::string key;
std::string sourceId;
bool isTrusted = false;
std::vector<std::string> indicators; // values treated as KEYWORD
std::vector<std::string> types; // values treated as TYPE
};
void addEntry(const Entry& entry);
// Check if an indicator exists
bool hasIndicator(const std::string& key, const std::string& value, IndicatorType type) const;
// Get all sources for a specific indicator
std::unordered_set<std::string> getSources(const std::string& key,
const std::string& value,
IndicatorType type) const;
// Statistics and metrics
size_t getIndicatorCount() const;
size_t getKeyCount() const;
size_t getValuePoolSize() const;
// Returns true if the given source string is marked as trusted (appears in the global trustedSources set)
bool isTrustedSource(const std::string &source) const;
// Container management
void clear();
// Serialization for cross-agent compatibility
// void serialize(std::ostream& stream) const;
template<class Archive>
void serialize(Archive& ar) const {
// trustedSources as array
std::vector<std::string> trusted_srcs;
for (auto p : trustedSources) {
if (p) trusted_srcs.push_back(*p);
}
ar.setNextName("trustedSources");
ar.startNode();
cereal::size_type n_trusted = static_cast<cereal::size_type>(trusted_srcs.size());
ar(cereal::make_size_tag(n_trusted));
for (const auto &s : trusted_srcs) ar(s);
ar.finishNode();
// logger: object of keys -> { totalSources: [...], indicators: {...}, types: {...} }
ar.setNextName("logger");
ar.startNode();
for (const auto &k : filtersDataPerKey) {
ar.setNextName(k.first.c_str());
ar.startNode();
// totalSources section (union per key)
ar.setNextName("totalSources");
ar.startNode();
const auto &ts = k.second.getTotalSources();
cereal::size_type ts_sz = static_cast<cereal::size_type>(ts.size());
ar(cereal::make_size_tag(ts_sz));
for (auto p : ts) ar(p ? *p : std::string());
ar.finishNode();
// indicators section
ar.setNextName("indicators");
ar.startNode();
for (const auto &kv : k.second.getIndicators()) {
const std::string *val = kv.first;
ar.setNextName(val ? val->c_str() : "");
ar.startNode();
cereal::size_type sz = static_cast<cereal::size_type>(kv.second.size());
ar(cereal::make_size_tag(sz));
for (auto p : kv.second) ar(p ? *p : std::string());
ar.finishNode(); // end value array
}
ar.finishNode(); // end indicators
// types section
ar.setNextName("types");
ar.startNode();
for (const auto &kv : k.second.getTypes()) {
const std::string *val = kv.first;
ar.setNextName(val ? val->c_str() : "");
ar.startNode();
cereal::size_type sz = static_cast<cereal::size_type>(kv.second.size());
ar(cereal::make_size_tag(sz));
for (auto p : kv.second) ar(p ? *p : std::string());
ar.finishNode(); // end value array
}
ar.finishNode(); // end types
ar.finishNode(); // end key object
}
ar.finishNode(); // end logger
}
void serialize(std::ostream &stream) const;
void deserialize(std::istream& stream);
private:
// Single indicator add
void addIndicator(const std::string& key, const std::string& value,
IndicatorType type, const std::string& source);
// String interning pool for values
std::unordered_set<std::string> valuePool;
// String interning pool for sources
std::unordered_set<std::string> sourcesPool;
// Main storage: key -> Filters
std::unordered_map<std::string, Filters> filtersDataPerKey;
// Global set of trusted sources
std::unordered_set<const std::string*> trustedSources;
// Helper methods
const std::string* internValue(const std::string& value);
const std::string* internSource(const std::string& source);
};
// UnifiedIndicatorsLogPost for REST, compatible with cereal and messaging
class UnifiedIndicatorsLogPost : public RestGetFile {
public:
UnifiedIndicatorsLogPost(std::shared_ptr<UnifiedIndicatorsContainer> container_ptr)
{
unifiedIndicators = std::move(*container_ptr);
}
private:
C2S_PARAM(UnifiedIndicatorsContainer, unifiedIndicators);
};
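
The container above leans on a string-interning scheme: each distinct value or source is stored once in an unordered_set<std::string>, and everything else holds pointers into that pool. A standalone sketch of the idea follows (independent of the class above); element addresses in node-based containers stay stable across inserts, which is what makes the pointers safe to keep.

#include <string>
#include <unordered_set>

// Standalone sketch of the interning idea: one std::string allocation per
// distinct value; callers keep cheap, stable pointers instead of copies.
class StringPool
{
public:
    const std::string* intern(const std::string &value)
    {
        auto it = pool.find(value);
        if (it == pool.end()) it = pool.insert(value).first;
        return &(*it);  // remains valid as long as the entry stays in the pool
    }
private:
    std::unordered_set<std::string> pool;
};

Keying FilterData by these interned pointers keeps hashing and comparison cheap, at the cost of the const_casts visible in the implementation file.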


@@ -0,0 +1,767 @@
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "WaapHyperscanEngine.h"
#include "Signatures.h"
#include "ScanResult.h"
#include "WaapSampleValue.h"
#include "Waf2Regex.h"
#include "Waf2Util.h"
#include "debug.h"
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <regex>
#ifdef USE_HYPERSCAN
#include "hs.h"
#endif
USE_DEBUG_FLAG(D_WAAP_SAMPLE_SCAN);
USE_DEBUG_FLAG(D_WAAP_HYPERSCAN);
#ifdef USE_HYPERSCAN
static const unsigned int HS_STANDARD_FLAGS = HS_FLAG_CASELESS | HS_FLAG_SOM_LEFTMOST;
#endif // USE_HYPERSCAN
static const bool matchOriginalPattern = true;
static const size_t maxRegexValidationMatches = 10;
class WaapHyperscanEngine::Impl {
public:
struct PatternInfo {
std::string originalPattern;
std::string hyperscanPattern;
std::string groupName;
std::string category; // "keywords", "specific_accuracy", "patterns"
bool isFastReg;
bool isEvasion;
std::string regexSource; // "specific_acuracy_keywords_regex", "words_regex", "pattern_regex"
Signatures::AssertionFlags assertionFlags; // Zero-length assertion flags
std::unique_ptr<SingleRegex> originalRegex; // Precompiled original pattern for validation
PatternInfo() : isFastReg(false), isEvasion(false) {}
};
struct MatchContext {
const WaapHyperscanEngine::Impl* engine;
const std::string* sampleText;
std::vector<std::string>* keyword_matches;
std::vector<std::string>* regex_matches;
Waap::Util::map_of_stringlists_t* found_patterns;
bool longTextFound;
bool binaryDataFound;
bool includePatternRegex;
bool includeKeywordRegex;
// Per-signature tracking of last match end (pattern id => last end offset)
std::unordered_map<unsigned int, size_t> lastMatchEndPerSignature;
};
Impl();
~Impl();
bool initialize(const std::shared_ptr<Signatures>& signatures);
void scanSample(const SampleValue& sample,
Waf2ScanResult& res,
bool longTextFound,
bool binaryDataFound,
bool includeKeywordRegex,
bool includePatternRegex) const;
bool isInitialized() const { return m_isInitialized; }
size_t getPatternCount() const { return m_patternInfos.size(); }
size_t getCompiledPatternCount() const { return m_compiledPatternCount; }
size_t getFailedPatternCount() const { return m_failedPatternCount; }
private:
#ifdef USE_HYPERSCAN
hs_database_t* m_keywordDatabase;
hs_database_t* m_patternDatabase;
hs_scratch_t* m_keywordScratch;
hs_scratch_t* m_patternScratch;
#endif
std::shared_ptr<Signatures> m_Signatures;
std::vector<PatternInfo> m_patternInfos;
bool m_isInitialized;
size_t m_compiledPatternCount;
size_t m_failedPatternCount;
// Helper methods
bool compileHyperscanDatabases(const std::shared_ptr<Signatures>& signatures);
void loadPrecompiledPatterns(const std::shared_ptr<Signatures>& signatures);
// use an ordered set to keep PCRE2-validated matches sorted and unique in input order
// LCOV_EXCL_START Reason: Trivial
struct Match {
size_t from;
size_t to;
Match(size_t from, size_t to) : from(from), to(to) {}
bool operator<(const Match& other) const {
return (from < other.from) || (from == other.from && to < other.to);
}
};
// LCOV_EXCL_STOP
// Assertion validation helpers
bool validateAssertions(const std::string& sampleText,
size_t matchStart,
size_t matchEnd,
const PatternInfo& patternInfo,
std::set<Match> &foundMatches,
size_t maxMatches) const;
static bool isWordChar(char c);
static bool isNonWordSpecialChar(char c);
#ifdef USE_HYPERSCAN
// Hyperscan callback function
static int onMatch(unsigned int id,
unsigned long long from,
unsigned long long to,
unsigned int flags,
void* context);
void processMatch(unsigned int id,
unsigned long long from,
unsigned long long to,
MatchContext* context) const;
void identifyFailingPatterns(const std::vector<std::string>& patterns,
const std::vector<PatternInfo>& hsPatterns,
const std::string& logPrefix) {
for (size_t i = 0; i < patterns.size(); ++i) {
const char *single_pattern = patterns[i].c_str();
unsigned int single_flag = HS_STANDARD_FLAGS;
unsigned int single_id = static_cast<unsigned int>(i);
hs_database_t *test_db = nullptr;
hs_compile_error_t *single_err = nullptr;
hs_error_t single_result = hs_compile_multi(&single_pattern,
&single_flag,
&single_id,
1,
HS_MODE_BLOCK,
nullptr,
&test_db,
&single_err);
if (single_result != HS_SUCCESS) {
std::string additional_info = "";
if (i < hsPatterns.size()) {
const auto &hsPattern = hsPatterns[i];
additional_info = " | Category: " + hsPattern.category +
" | Group: " + hsPattern.groupName +
" | Source: " + hsPattern.regexSource;
if (!hsPattern.originalPattern.empty() &&
hsPattern.originalPattern != hsPattern.hyperscanPattern) {
additional_info += " | Original: '" + hsPattern.originalPattern + "'";
}
}
dbgWarning(D_WAAP_HYPERSCAN)
<< logPrefix << " [" << i << "]: '" << patterns[i]
<< "' - Error: " << (single_err ? single_err->message : "unknown") << additional_info;
if (single_err) {
hs_free_compile_error(single_err);
single_err = nullptr;
}
} else {
if (test_db) {
hs_free_database(test_db);
test_db = nullptr;
}
}
if (single_err) {
hs_free_compile_error(single_err);
single_err = nullptr;
}
}
}
#endif // USE_HYPERSCAN
};
WaapHyperscanEngine::Impl::Impl()
:
#ifdef USE_HYPERSCAN
m_keywordDatabase(nullptr), m_patternDatabase(nullptr), m_keywordScratch(nullptr), m_patternScratch(nullptr),
#endif // USE_HYPERSCAN
m_isInitialized(false), m_compiledPatternCount(0), m_failedPatternCount(0)
{
}
WaapHyperscanEngine::Impl::~Impl()
{
#ifdef USE_HYPERSCAN
if (m_keywordScratch) hs_free_scratch(m_keywordScratch);
if (m_patternScratch) hs_free_scratch(m_patternScratch);
if (m_keywordDatabase) hs_free_database(m_keywordDatabase);
if (m_patternDatabase) hs_free_database(m_patternDatabase);
#endif
}
bool WaapHyperscanEngine::Impl::initialize(const std::shared_ptr<Signatures> &signatures)
{
if (!signatures) {
dbgWarning(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine::initialize: null signatures";
return false;
}
m_Signatures = signatures;
#ifdef USE_HYPERSCAN
m_isInitialized = compileHyperscanDatabases(signatures);
if (m_isInitialized) {
dbgInfo(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine initialized successfully. "
<< "Compiled: " << m_compiledPatternCount << ", Failed: " << m_failedPatternCount;
} else {
dbgWarning(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine initialization failed";
}
return m_isInitialized;
#else
dbgInfo(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine: Hyperscan not available on this platform";
return false;
#endif
}
bool WaapHyperscanEngine::Impl::compileHyperscanDatabases(const std::shared_ptr<Signatures> &signatures)
{
#ifdef USE_HYPERSCAN
// Load precompiled patterns from signatures instead of extracting at runtime
loadPrecompiledPatterns(signatures);
std::vector<std::string> keywordPatterns;
std::vector<std::string> patternRegexPatterns;
// Collect keyword patterns (from specific_accuracy and keywords categories)
auto keywordAssertionFlags = signatures->getKeywordAssertionFlags();
for (size_t i = 0; i < signatures->getKeywordHyperscanPatterns().size(); ++i) {
const auto &hsPattern = signatures->getKeywordHyperscanPatterns()[i];
keywordPatterns.push_back(hsPattern.hyperscanPattern);
PatternInfo info;
info.originalPattern = hsPattern.originalPattern;
info.hyperscanPattern = hsPattern.hyperscanPattern;
info.category = hsPattern.category;
info.regexSource = hsPattern.regexSource;
info.groupName = hsPattern.groupName;
info.isFastReg = hsPattern.isFastReg;
info.isEvasion = hsPattern.isEvasion;
// Set assertion flags if available
if (i < keywordAssertionFlags.size()) {
info.assertionFlags = keywordAssertionFlags[i];
}
// Compile original regex pattern for validation only when matchOriginal flag is set
if (!info.originalPattern.empty() && matchOriginalPattern) {
bool regexError = false;
info.originalRegex = std::make_unique<SingleRegex>(
info.originalPattern, regexError, "ValidationRegex_" + info.groupName + "_" + std::to_string(i));
if (regexError) {
dbgWarning(D_WAAP_HYPERSCAN)
<< "Failed to compile original regex for pattern: " << info.originalPattern
<< " (group: " << info.groupName << ")";
info.originalRegex.reset(); // Clear failed regex
}
}
m_patternInfos.push_back(std::move(info));
}
// Collect pattern regex patterns (from patterns category)
auto patternAssertionFlags = signatures->getPatternAssertionFlags();
for (size_t i = 0; i < signatures->getPatternHyperscanPatterns().size(); ++i) {
const auto &hsPattern = signatures->getPatternHyperscanPatterns()[i];
patternRegexPatterns.push_back(hsPattern.hyperscanPattern);
PatternInfo info;
info.originalPattern = hsPattern.originalPattern;
info.hyperscanPattern = hsPattern.hyperscanPattern;
info.category = hsPattern.category;
info.regexSource = hsPattern.regexSource;
info.groupName = hsPattern.groupName;
info.isFastReg = hsPattern.isFastReg;
info.isEvasion = hsPattern.isEvasion;
// Set assertion flags if available
if (i < patternAssertionFlags.size()) {
info.assertionFlags = patternAssertionFlags[i];
}
// Compile original regex pattern for validation only when matchOriginal flag is set
if (!info.originalPattern.empty() && matchOriginalPattern) {
bool regexError = false;
size_t patternIndex = keywordPatterns.size() + i; // Offset by keyword patterns count
info.originalRegex = std::make_unique<SingleRegex>(info.originalPattern, regexError,
"ValidationRegex_" + info.groupName + "_" + std::to_string(patternIndex));
if (regexError) {
dbgWarning(D_WAAP_HYPERSCAN)
<< "Failed to compile original regex for pattern: " << info.originalPattern
<< " (group: " << info.groupName << ")";
info.originalRegex.reset(); // Clear failed regex
}
}
m_patternInfos.push_back(std::move(info));
}
dbgInfo(D_WAAP_HYPERSCAN) << "Using precompiled patterns: "
<< "keywords=" << keywordPatterns.size()
<< ", patterns=" << patternRegexPatterns.size();
// Compile keyword database (specific_acuracy_keywords_regex + words_regex)
size_t total_ids = 0;
if (!keywordPatterns.empty()) {
std::vector<const char *> c_patterns;
std::vector<unsigned int> flags;
std::vector<unsigned int> ids;
for (size_t i = 0; i < keywordPatterns.size(); ++i) {
c_patterns.push_back(keywordPatterns[i].c_str());
flags.push_back(HS_STANDARD_FLAGS);
ids.push_back(static_cast<unsigned int>(total_ids++));
}
// Defensive checks before calling hs_compile_multi
if (c_patterns.size() != flags.size() || c_patterns.size() != ids.size()) {
dbgWarning(D_WAAP_HYPERSCAN) << "Pattern, flag, and id arrays are not the same size!";
return false;
}
if (c_patterns.empty()) {
dbgWarning(D_WAAP_HYPERSCAN) << "No patterns to compile!";
return false;
}
dbgInfo(D_WAAP_HYPERSCAN) << "Compiling " << c_patterns.size()
<< " keyword patterns with hs_compile_multi. First pattern: '"
<< keywordPatterns[0] << "'";
hs_compile_error_t *compile_err = nullptr;
hs_error_t result =
hs_compile_multi(c_patterns.data(),
flags.data(),
ids.data(),
static_cast<unsigned int>(c_patterns.size()),
HS_MODE_BLOCK,
nullptr,
&m_keywordDatabase,
&compile_err);
if (result != HS_SUCCESS) {
std::string error_msg = compile_err ? compile_err->message : "unknown error";
dbgWarning(D_WAAP_HYPERSCAN) << "Failed to compile keyword database: " << error_msg;
// Try to identify the specific failing pattern(s)
if (compile_err) {
dbgWarning(D_WAAP_HYPERSCAN) << "Attempting to identify failing keyword pattern(s)...";
auto keywordHsPatterns = signatures->getKeywordHyperscanPatterns();
std::vector<PatternInfo> keywordPatternInfos;
keywordPatternInfos.reserve(keywordHsPatterns.size());
for (const auto& hsPattern : keywordHsPatterns) {
keywordPatternInfos.emplace_back();
PatternInfo& info = keywordPatternInfos.back();
info.originalPattern = hsPattern.originalPattern;
info.hyperscanPattern = hsPattern.hyperscanPattern;
info.category = hsPattern.category;
info.regexSource = hsPattern.regexSource;
info.groupName = hsPattern.groupName;
info.isFastReg = hsPattern.isFastReg;
info.isEvasion = hsPattern.isEvasion;
}
identifyFailingPatterns(keywordPatterns, keywordPatternInfos, "Failing keyword pattern");
}
if (compile_err) {
hs_free_compile_error(compile_err);
compile_err = nullptr;
}
return false;
}
if (hs_alloc_scratch(m_keywordDatabase, &m_keywordScratch) != HS_SUCCESS) {
dbgWarning(D_WAAP_HYPERSCAN) << "Failed to allocate keyword scratch space";
return false;
}
m_compiledPatternCount += keywordPatterns.size();
}
// Compile pattern database (pattern_regex)
if (!patternRegexPatterns.empty()) {
std::vector<const char *> c_patterns;
std::vector<unsigned int> flags;
std::vector<unsigned int> ids;
for (size_t i = 0; i < patternRegexPatterns.size(); ++i) {
c_patterns.push_back(patternRegexPatterns[i].c_str());
flags.push_back(HS_STANDARD_FLAGS);
ids.push_back(static_cast<unsigned int>(total_ids++));
}
// Defensive checks before calling hs_compile_multi
if (c_patterns.size() != flags.size() || c_patterns.size() != ids.size()) {
dbgWarning(D_WAAP_HYPERSCAN)
<< "Pattern, flag, and id arrays are not the same size! (patternRegexPatterns)";
return false;
}
if (c_patterns.empty()) {
dbgWarning(D_WAAP_HYPERSCAN) << "No pattern regex patterns to compile!";
return false;
}
dbgInfo(D_WAAP_HYPERSCAN) << "Compiling " << c_patterns.size()
<< " pattern regex patterns with hs_compile_multi. First pattern: '"
<< patternRegexPatterns[0] << "'";
hs_compile_error_t *compile_err = nullptr;
hs_error_t result =
hs_compile_multi(c_patterns.data(),
flags.data(),
ids.data(),
static_cast<unsigned int>(c_patterns.size()),
HS_MODE_BLOCK,
nullptr,
&m_patternDatabase,
&compile_err);
if (result != HS_SUCCESS) {
std::string error_msg = compile_err ? compile_err->message : "unknown error";
dbgWarning(D_WAAP_HYPERSCAN) << "Failed to compile pattern database: " << error_msg;
// Try to identify the specific failing pattern(s)
if (compile_err) {
dbgWarning(D_WAAP_HYPERSCAN) << "Attempting to identify failing pattern regex pattern(s)...";
auto patternHsPatterns = signatures->getPatternHyperscanPatterns();
std::vector<PatternInfo> patternPatternInfos;
patternPatternInfos.reserve(patternHsPatterns.size());
for (const auto& hsPattern : patternHsPatterns) {
patternPatternInfos.emplace_back();
PatternInfo& info = patternPatternInfos.back();
info.originalPattern = hsPattern.originalPattern;
info.hyperscanPattern = hsPattern.hyperscanPattern;
info.category = hsPattern.category;
info.regexSource = hsPattern.regexSource;
info.groupName = hsPattern.groupName;
info.isFastReg = hsPattern.isFastReg;
info.isEvasion = hsPattern.isEvasion;
}
identifyFailingPatterns(patternRegexPatterns, patternPatternInfos, "Failing pattern regex");
}
if (compile_err) {
hs_free_compile_error(compile_err);
compile_err = nullptr;
}
return false;
}
if (hs_alloc_scratch(m_patternDatabase, &m_patternScratch) != HS_SUCCESS) {
dbgWarning(D_WAAP_HYPERSCAN) << "Failed to allocate pattern scratch space";
return false;
}
m_compiledPatternCount += patternRegexPatterns.size();
}
return true;
#else // USE_HYPERSCAN
return false;
#endif // USE_HYPERSCAN
}
void WaapHyperscanEngine::Impl::loadPrecompiledPatterns(const std::shared_ptr<Signatures> &signatures)
{
// This method is called to initialize any additional pattern processing if needed
// For now, the patterns are directly accessed from the signatures object
dbgTrace(D_WAAP_HYPERSCAN) << "Loading precompiled patterns from Signatures";
m_Signatures = signatures;
}
#ifdef USE_HYPERSCAN
int WaapHyperscanEngine::Impl::onMatch(unsigned int id,
unsigned long long from,
unsigned long long to,
unsigned int flags,
void *context)
{
MatchContext *ctx = static_cast<MatchContext *>(context);
ctx->engine->processMatch(id, from, to, ctx);
return 0; // Continue scanning
}
void WaapHyperscanEngine::Impl::processMatch(unsigned int id,
unsigned long long from,
unsigned long long to,
MatchContext *context) const
{
if (id >= m_patternInfos.size()) {
dbgWarning(D_WAAP_HYPERSCAN) << "Invalid pattern ID: " << id;
return;
}
const PatternInfo &info = m_patternInfos[id];
const std::string &sampleText = *context->sampleText;
size_t start = static_cast<size_t>(from);
size_t end = static_cast<size_t>(to);
if (end > sampleText.length()) end = sampleText.length();
if (start >= end) return;
// skip overlaps for this pattern
size_t &lastEnd = context->lastMatchEndPerSignature[id];
if (start < lastEnd) {
dbgTrace(D_WAAP_HYPERSCAN) << "Skipping overlapping match for pattern id=" << id << " start=" << start
<< " lastEnd=" << lastEnd << ", match: '" << sampleText.substr(start, end - start)
<< "'";
return;
}
std::set<Match> foundMatches;
if (!validateAssertions(sampleText, start, end, info, foundMatches, maxRegexValidationMatches)) return;
for (const auto &match : foundMatches) {
std::string matchedText = sampleText.substr(match.from, match.to - match.from);
std::string word = matchedText;
dbgTrace(D_WAAP_HYPERSCAN) << " match='" << word << "' id='" << id << "' group='" << info.groupName
<< "' category=" << info.category;
if (context->binaryDataFound && word.size() <= 2) {
dbgTrace(D_WAAP_HYPERSCAN)
<< "Will not add a short keyword '" << word << "' because binaryData was found";
continue;
}
if (context->includeKeywordRegex && (info.category == "keywords" || info.category == "specific_accuracy")) {
m_Signatures->processRegexMatch(info.groupName, matchedText, word, *context->keyword_matches,
*context->found_patterns, context->longTextFound,
context->binaryDataFound);
} else if (context->includePatternRegex && info.category == "patterns") {
m_Signatures->processRegexMatch(info.groupName, matchedText, word, *context->regex_matches,
*context->found_patterns, context->longTextFound,
context->binaryDataFound);
}
lastEnd = std::max(lastEnd, match.to);
}
}
#endif // USE_HYPERSCAN
void WaapHyperscanEngine::Impl::scanSample(const SampleValue &sample, Waf2ScanResult &res, bool longTextFound,
bool binaryDataFound, bool includeKeywordRegex, bool includePatternRegex) const
{
#ifdef USE_HYPERSCAN
if (!m_isInitialized) {
dbgTrace(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine: not initialized, skipping scan";
return;
}
const std::string &sampleText = sample.getSampleString();
MatchContext context;
context.engine = this;
context.sampleText = &sampleText;
context.keyword_matches = &res.keyword_matches;
context.regex_matches = &res.regex_matches;
context.found_patterns = &res.found_patterns;
context.longTextFound = longTextFound;
context.binaryDataFound = binaryDataFound;
context.includePatternRegex = includePatternRegex;
context.includeKeywordRegex = includeKeywordRegex;
context.lastMatchEndPerSignature.clear();
dbgTrace(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine::scanSample: scanning '" << sampleText
<< "' longTextFound=" << longTextFound << " binaryDataFound=" << binaryDataFound
<< " includeKeywordRegex=" << includeKeywordRegex
<< " includePatternRegex=" << includePatternRegex;
if (includeKeywordRegex && m_keywordDatabase && m_keywordScratch) {
hs_error_t result =
hs_scan(m_keywordDatabase, sampleText.c_str(), static_cast<unsigned int>(sampleText.length()), 0,
m_keywordScratch, onMatch, &context);
if (result != HS_SUCCESS) {
dbgWarning(D_WAAP_HYPERSCAN) << "Keyword database scan failed: " << result;
}
}
if (includePatternRegex && m_patternDatabase && m_patternScratch) {
hs_error_t result =
hs_scan(m_patternDatabase, sampleText.c_str(), static_cast<unsigned int>(sampleText.length()), 0,
m_patternScratch, onMatch, &context);
if (result != HS_SUCCESS) {
dbgWarning(D_WAAP_HYPERSCAN) << "Pattern database scan failed: " << result;
}
}
dbgTrace(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine::scanSample: found " << res.keyword_matches.size()
<< " keyword matches, " << res.regex_matches.size() << " regex matches";
#else
dbgWarning(D_WAAP_HYPERSCAN) << "WaapHyperscanEngine::scanSample called but Hyperscan not available";
#endif
}
bool WaapHyperscanEngine::Impl::validateAssertions(const std::string &sampleText, size_t matchStart, size_t matchEnd,
const PatternInfo &patternInfo, std::set<Match> &foundMatches,
size_t maxMatches) const
{
foundMatches.clear();
// If we don't have an original regex compiled, fall back to the assertion flags validation
if (!patternInfo.originalRegex) {
dbgTrace(D_WAAP_HYPERSCAN) << "No original regex available for validation, "
<< "falling back to assertion flags check";
foundMatches.emplace(matchStart, matchEnd);
// If no assertion flags are set, the match is valid
if (patternInfo.assertionFlags.empty()) {
return true;
}
if (
patternInfo.assertionFlags.isSet(Signatures::AssertionFlag::END_NON_WORD_AHEAD) &&
matchEnd < sampleText.length() &&
isWordChar(sampleText[matchEnd])) {
// (?!\w) - requires NO word character after the match
return false;
}
if (patternInfo.assertionFlags.isSet(Signatures::AssertionFlag::START_NON_WORD_BEHIND) && matchStart > 0 &&
isWordChar(sampleText[matchStart - 1])) {
// (?<!\w) - requires NO word character before the match
return false;
}
// Check start assertions
if (patternInfo.assertionFlags.isSet(Signatures::AssertionFlag::START_WORD_BEHIND) &&
(matchStart == 0 || !isWordChar(sampleText[matchStart - 1]))) {
// (?<=\w) - requires a word character before the match
return false;
}
// Check end assertions
if (patternInfo.assertionFlags.isSet(Signatures::AssertionFlag::END_WORD_AHEAD) &&
(matchEnd >= sampleText.length() || !isWordChar(sampleText[matchEnd]))) {
// (?=\w) - requires a word character after the match
return false;
}
if (patternInfo.assertionFlags.isSet(Signatures::AssertionFlag::END_NON_WORD_SPECIAL)) {
// (?=[^\w?<>:=]|$) - requires a non-word character (excluding ?<>:=) or end of string after the match
if (matchEnd < sampleText.length()) {
char nextChar = sampleText[matchEnd];
if (isWordChar(nextChar) || nextChar == '?' || nextChar == '<' || nextChar == '>' || nextChar == ':' ||
nextChar == '=') {
return false;
}
}
// If we're at the end of string, this condition is satisfied
}
return true;
}
if (patternInfo.assertionFlags.isSet(Signatures::AssertionFlag::WILDCARD_EVASION)) {
// skip the match unless it contains both a slash (forward or back) and a question mark
bool hasSlash = false;
bool hasQuestionMark = false;
for (size_t i = matchStart; i < matchEnd && !(hasSlash && hasQuestionMark); ++i) {
if (sampleText[i] == '\\' || sampleText[i] == '/') {
hasSlash = true;
}
if (sampleText[i] == '?') {
hasQuestionMark = true;
}
}
dbgTrace(D_WAAP_HYPERSCAN) << "Testing for wildcard evasion: '"
<< " hasSlash=" << hasSlash << " hasQuestionMark=" << hasQuestionMark;
if (!hasSlash || !hasQuestionMark) {
return false;
}
}
// Use the original compiled regex to find matches within the specified range
std::vector<RegexMatchRange> matchRanges;
// look behind to cover possible assertions, look ahead much further to cover lazy hyperscan match end
static const size_t lookbehind_range = 4, lookahead_range = 32;
size_t searchStart = (matchStart > lookbehind_range) ? (matchStart - lookbehind_range) : 0UL;
size_t searchEnd = ((matchEnd + lookahead_range) < matchEnd || (matchEnd + lookahead_range) > sampleText.length())
? sampleText.length() // overflow
: (matchEnd + lookahead_range); // within bounds
std::vector<RegexMatchRange> regex_matches;
patternInfo.originalRegex->findMatchRanges(sampleText, regex_matches, maxMatches, searchStart, searchEnd);
for (const auto &match : regex_matches) {
foundMatches.emplace(match.start, match.end);
if (isDebugRequired(TRACE, D_WAAP_HYPERSCAN)) {
dbgTrace(D_WAAP_HYPERSCAN) << "Match for: '" << patternInfo.originalPattern << "' matched in range ["
<< match.start << "," << match.end << "] "
<< "matched text: '"
<< sampleText.substr(match.start, match.end - match.start)
<< "'";
}
}
if (foundMatches.empty()) {
if (isDebugRequired(TRACE, D_WAAP_HYPERSCAN)) {
dbgTrace(D_WAAP_HYPERSCAN) << "No match for: '" << patternInfo.originalPattern
<< "' did not match in range [" << matchStart << "," << matchEnd << "] "
<< "matched text: '" << sampleText.substr(matchStart, matchEnd - matchStart)
<< "'";
}
return false;
}
return true;
}
// LCOV_EXCL_START Reason: Not in use currently, but kept for future reference
bool WaapHyperscanEngine::Impl::isWordChar(char c)
{
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
}
bool WaapHyperscanEngine::Impl::isNonWordSpecialChar(char c)
{
return c == '?' || c == '<' || c == '>' || c == ':' || c == '=';
}
// LCOV_EXCL_STOP
// WaapHyperscanEngine public interface - delegates to Impl
WaapHyperscanEngine::WaapHyperscanEngine() : pimpl(std::make_unique<Impl>())
{
}
WaapHyperscanEngine::~WaapHyperscanEngine() = default;
bool WaapHyperscanEngine::initialize(const std::shared_ptr<Signatures>& signatures)
{
return pimpl->initialize(signatures);
}
void WaapHyperscanEngine::scanSample(const SampleValue& sample, Waf2ScanResult& res, bool longTextFound,
bool binaryDataFound, bool includeKeywordRegex, bool includePatternRegex) const
{
pimpl->scanSample(sample, res, longTextFound, binaryDataFound, includeKeywordRegex, includePatternRegex);
}
bool WaapHyperscanEngine::isInitialized() const
{
return pimpl->isInitialized();
}
size_t WaapHyperscanEngine::getPatternCount() const
{
return pimpl->getPatternCount();
}
size_t WaapHyperscanEngine::getCompiledPatternCount() const
{
return pimpl->getCompiledPatternCount();
}
size_t WaapHyperscanEngine::getFailedPatternCount() const
{
return pimpl->getFailedPatternCount();
}
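
For readers unfamiliar with the Hyperscan calls used above, here is a standalone block-mode sketch (patterns and input are illustrative, and it is guarded the same way the engine is).

#ifdef USE_HYPERSCAN
#include <cstdio>
#include "hs.h"

// Minimal hs_compile_multi + hs_scan flow, mirroring the engine above: compile
// several patterns into one database, allocate scratch once, then scan a buffer
// and receive (id, from, to) callbacks.
static int onExampleMatch(unsigned int id, unsigned long long from,
                          unsigned long long to, unsigned int, void *)
{
    std::printf("pattern %u matched [%llu,%llu)\n", id, from, to);
    return 0;  // 0 = keep scanning
}

static void hyperscanExample()
{
    const char *patterns[] = {"select", "union\\s+select"};
    unsigned int flags[] = {HS_FLAG_CASELESS | HS_FLAG_SOM_LEFTMOST,
                            HS_FLAG_CASELESS | HS_FLAG_SOM_LEFTMOST};
    unsigned int ids[] = {0, 1};
    hs_database_t *db = nullptr;
    hs_compile_error_t *err = nullptr;
    if (hs_compile_multi(patterns, flags, ids, 2, HS_MODE_BLOCK, nullptr, &db, &err) != HS_SUCCESS) {
        hs_free_compile_error(err);
        return;
    }
    hs_scratch_t *scratch = nullptr;
    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) {
        hs_free_database(db);
        return;
    }
    const char text[] = "id=1 UNION SELECT password FROM users";
    hs_scan(db, text, sizeof(text) - 1, 0, scratch, onExampleMatch, nullptr);
    hs_free_scratch(scratch);
    hs_free_database(db);
}
#endif // USE_HYPERSCAN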


@@ -0,0 +1,54 @@
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __WAAP_HYPERSCAN_ENGINE_H__
#define __WAAP_HYPERSCAN_ENGINE_H__
#include <string>
#include <vector>
#include <map>
#include <set>
#include <memory>
class Signatures;
class SampleValue;
struct Waf2ScanResult;
class WaapHyperscanEngine {
public:
WaapHyperscanEngine();
~WaapHyperscanEngine();
// Initialize with patterns from Signatures
bool initialize(const std::shared_ptr<Signatures>& signatures);
// Main scanning function - same interface as performStandardRegexChecks
void scanSample(const SampleValue& sample,
Waf2ScanResult& res,
bool longTextFound,
bool binaryDataFound,
bool includeKeywordRegex,
bool includePatternRegex) const;
// Check if the engine is ready to use
bool isInitialized() const;
// Get statistics
size_t getPatternCount() const;
size_t getCompiledPatternCount() const;
size_t getFailedPatternCount() const;
private:
class Impl;
std::unique_ptr<Impl> pimpl;
};
#endif // __WAAP_HYPERSCAN_ENGINE_H__
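
A hedged usage sketch of the interface above; how the Signatures object is loaded is outside this file's scope, so it is taken as a given.

#include <memory>
#include "WaapHyperscanEngine.h"

// Sketch only: assumes an already-loaded Signatures object and a Waf2ScanResult
// provided by the surrounding WAAP scanning code.
void scanWithHyperscan(const std::shared_ptr<Signatures> &signatures,
                       const SampleValue &sample,
                       Waf2ScanResult &result)
{
    WaapHyperscanEngine engine;
    if (!engine.initialize(signatures)) {
        return;  // e.g. Hyperscan unavailable; caller falls back to the standard regex path
    }
    engine.scanSample(sample,
                      result,
                      /*longTextFound=*/false,
                      /*binaryDataFound=*/false,
                      /*includeKeywordRegex=*/true,
                      /*includePatternRegex=*/true);
}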


@@ -0,0 +1,481 @@
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "buffered_compressed_stream.h"
#include "waap.h"
#include "compression_utils.h"
#include <sstream>
USE_DEBUG_FLAG(D_WAAP_SERIALIZE);
using namespace std;
void yieldIfPossible(const string &func, int line)
{
// check mainloop exists and current routine is not the main routine
if (Singleton::exists<I_MainLoop>() &&
Singleton::Consume<I_MainLoop>::by<WaapComponent>()->getCurrentRoutineId().ok())
{
dbgDebug(D_WAAP_SERIALIZE) << "Yielding to main loop from: " << func << ":" << line;
Singleton::Consume<I_MainLoop>::by<WaapComponent>()->yield(false);
}
}
// Static member definitions
const size_t BufferedCompressedOutputStream::CompressedBuffer::BUFFER_SIZE;
const size_t BufferedCompressedInputStream::DecompressedBuffer::OUTPUT_BUFFER_SIZE;
const size_t BufferedCompressedInputStream::DecompressedBuffer::CHUNK_SIZE;
BufferedCompressedOutputStream::BufferedCompressedOutputStream(ostream &underlying_stream)
:
ostream(nullptr),
m_buffer(make_unique<CompressedBuffer>(underlying_stream))
{
rdbuf(m_buffer.get());
}
BufferedCompressedOutputStream::~BufferedCompressedOutputStream()
{
try {
close();
} catch (exception &e) {
// Destructor should not throw
dbgWarning(D_WAAP_SERIALIZE) << "Exception in BufferedCompressedOutputStream destructor: " << e.what();
}
}
void BufferedCompressedOutputStream::flush()
{
if (m_buffer) {
dbgTrace(D_WAAP_SERIALIZE) << "Flushing internal buffer...";
m_buffer->flushBuffer(); // This will compress and encrypt the current buffer with is_last=false
// and flush the underlying stream.
}
// Do NOT call ostream::flush() here, as it would call sync() on our m_buffer,
// which calls compressAndEncryptBuffer(true) and finalizes the GZIP stream prematurely.
// The m_underlying_stream within m_buffer is flushed by compressAndEncryptBuffer itself.
}
void BufferedCompressedOutputStream::close()
{
if (m_buffer) {
dbgTrace(D_WAAP_SERIALIZE) << "Closing stream and flushing buffer...";
m_buffer->flushAndClose();
}
}
BufferedCompressedOutputStream::CompressedBuffer::CompressedBuffer(ostream &underlying_stream)
:
m_underlying_stream(underlying_stream),
m_buffer(),
m_compression_stream(nullptr),
m_closed(false)
{
m_buffer.reserve(BUFFER_SIZE);
m_compression_stream = initCompressionStream();
}
BufferedCompressedOutputStream::CompressedBuffer::~CompressedBuffer()
{
try {
if (!m_closed) {
sync();
}
if (m_compression_stream) {
finiCompressionStream(m_compression_stream);
m_compression_stream = nullptr;
}
} catch (exception &e) {
// Destructor should not throw
dbgWarning(D_WAAP_SERIALIZE) << "Exception in CompressedBuffer destructor: " << e.what();
}
}
void BufferedCompressedOutputStream::CompressedBuffer::flushAndClose()
{
sync();
}
int BufferedCompressedOutputStream::CompressedBuffer::overflow(int c)
{
if (m_closed) {
dbgTrace(D_WAAP_SERIALIZE) << "Stream is closed, returning EOF";
return traits_type::eof();
}
if (c != traits_type::eof()) {
m_buffer.push_back(static_cast<char>(c));
dbgTrace(D_WAAP_SERIALIZE) << "Added char, buffer size now: " << m_buffer.size();
}
if (m_buffer.size() >= BUFFER_SIZE) {
dbgTrace(D_WAAP_SERIALIZE) << "Buffer full, flushing...";
compressAndEncryptBuffer(false);
}
return c;
}
streamsize BufferedCompressedOutputStream::CompressedBuffer::xsputn(const char* s, streamsize n)
{
if (m_closed) {
dbgDebug(D_WAAP_SERIALIZE) << "Stream is closed, returning 0";
return 0;
}
dbgTrace(D_WAAP_SERIALIZE) << "Writing " << n << " bytes";
streamsize written = 0;
while (written < n) {
size_t space_available = BUFFER_SIZE - m_buffer.size();
size_t to_write = min(static_cast<size_t>(n - written), space_available);
m_buffer.insert(m_buffer.end(), s + written, s + written + to_write);
written += to_write;
dbgTrace(D_WAAP_SERIALIZE) << "Wrote " << to_write << " bytes, total written: " << written
<< ", buffer size: " << m_buffer.size();
if (m_buffer.size() >= BUFFER_SIZE) {
dbgTrace(D_WAAP_SERIALIZE) << "Buffer full, flushing...";
compressAndEncryptBuffer(false);
}
}
dbgTrace(D_WAAP_SERIALIZE) << "Completed, total written: " << written;
return written;
}
int BufferedCompressedOutputStream::CompressedBuffer::sync()
{
dbgTrace(D_WAAP_SERIALIZE) << "Called, closed=" << m_closed << ", buffer size=" << m_buffer.size();
if (!m_closed) {
bool success = compressAndEncryptBuffer(true); // Attempt final compression/encryption
// Mark as closed REGARDLESS of the success of the attempt to ensure finalization logic
// for this context isn't re-attempted if this call failed.
m_closed = true;
if (!success) {
dbgWarning(D_WAAP_SERIALIZE) << "Final compression/encryption failed";
return -1;
}
dbgTrace(D_WAAP_SERIALIZE) << "Stream closed successfully";
} else {
dbgDebug(D_WAAP_SERIALIZE) << "Stream already closed, skipping";
}
return 0;
}
void BufferedCompressedOutputStream::CompressedBuffer::flushBuffer()
{
if (m_buffer.empty() || m_closed) {
return;
}
dbgTrace(D_WAAP_SERIALIZE) << "Flushing buffer with " << m_buffer.size() << " bytes";
compressAndEncryptBuffer(false);
}
bool BufferedCompressedOutputStream::CompressedBuffer::compressAndEncryptBuffer(bool is_last)
{
// If the stream is already marked as closed at this buffer's level,
// it means sync() has run, and everything, including encryption, has been finalized.
if (m_closed) {
dbgTrace(D_WAAP_SERIALIZE) << "Stream is already closed, skipping.";
return true;
}
// Skip if there's nothing to compress and this is not the final flush
if (m_buffer.empty() && !is_last) {
dbgTrace(D_WAAP_SERIALIZE) << "Buffer empty and not last call, skipping.";
return true;
}
dbgTrace(D_WAAP_SERIALIZE) << "Compressing and encrypting " << m_buffer.size() << " bytes, is_last: " << is_last;
// Compress the buffer
CompressionResult result = compressData(
m_compression_stream,
CompressionType::GZIP,
static_cast<uint32_t>(m_buffer.size()),
reinterpret_cast<const unsigned char*>(m_buffer.data()),
is_last ? 1 : 0
);
if (!result.ok) {
dbgWarning(D_WAAP_SERIALIZE) << "Failed to compress data";
return false;
}
string compressed_data;
if (result.output && result.num_output_bytes > 0) {
compressed_data = string(reinterpret_cast<const char*>(result.output), result.num_output_bytes);
free(result.output);
}
dbgDebug(D_WAAP_SERIALIZE) << "Compression complete: " << m_buffer.size()
<< " bytes -> " << compressed_data.size() << " bytes";
// Yield after compression to allow other routines to run
YIELD_IF_POSSIBLE();
string final_data = compressed_data;
// Write to underlying stream only if we have data to write
if (!final_data.empty()) {
m_underlying_stream.write(final_data.c_str(), final_data.size());
m_underlying_stream.flush();
}
m_buffer.clear();
// Yield after writing chunk to allow other routines to run
YIELD_IF_POSSIBLE();
return true;
}
BufferedCompressedInputStream::BufferedCompressedInputStream(istream &underlying_stream)
:
istream(nullptr),
m_buffer(make_unique<DecompressedBuffer>(underlying_stream))
{
rdbuf(m_buffer.get());
}
BufferedCompressedInputStream::~BufferedCompressedInputStream()
{
// DecompressedBuffer destructor will handle cleanup
}
BufferedCompressedInputStream::DecompressedBuffer::DecompressedBuffer(istream &underlying_stream)
:
m_underlying_stream(underlying_stream),
m_buffer(),
m_encrypted_buffer(),
m_compressed_buffer(),
m_decompressed_buffer(),
m_decompressed_pos(0),
m_compression_stream(nullptr),
m_eof_reached(false),
m_stream_finished(false)
{
m_buffer.resize(OUTPUT_BUFFER_SIZE);
m_encrypted_buffer.reserve(CHUNK_SIZE);
m_compressed_buffer.reserve(CHUNK_SIZE);
m_decompressed_buffer.reserve(OUTPUT_BUFFER_SIZE);
m_compression_stream = initCompressionStream();
// Set buffer pointers to indicate empty buffer
setg(m_buffer.data(), m_buffer.data(), m_buffer.data());
}
BufferedCompressedInputStream::DecompressedBuffer::~DecompressedBuffer()
{
try {
if (m_compression_stream) {
finiCompressionStream(m_compression_stream);
m_compression_stream = nullptr;
}
} catch (exception &e) {
// Destructor should not throw
dbgWarning(D_WAAP_SERIALIZE) << "Exception in DecompressedBuffer destructor: " << e.what();
}
}
int BufferedCompressedInputStream::DecompressedBuffer::underflow()
{
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
if (m_eof_reached) {
return traits_type::eof();
}
if (!fillBuffer()) {
m_eof_reached = true;
return traits_type::eof();
}
return traits_type::to_int_type(*gptr());
}
streamsize BufferedCompressedInputStream::DecompressedBuffer::xsgetn(char* s, streamsize n)
{
streamsize total_read = 0;
while (total_read < n) {
if (gptr() >= egptr()) {
if (!fillBuffer()) {
m_eof_reached = true;
break;
}
}
streamsize available = egptr() - gptr();
streamsize to_copy = min(n - total_read, available);
memcpy(s + total_read, gptr(), to_copy);
gbump(static_cast<int>(to_copy));
total_read += to_copy;
}
return total_read;
}
bool BufferedCompressedInputStream::DecompressedBuffer::fillBuffer()
{
if (m_eof_reached) {
return false;
}
// If we have remaining data in the decompressed buffer, use it first
if (m_decompressed_pos < m_decompressed_buffer.size()) {
size_t remaining = m_decompressed_buffer.size() - m_decompressed_pos;
size_t to_copy = min(remaining, OUTPUT_BUFFER_SIZE);
memcpy(m_buffer.data(), m_decompressed_buffer.data() + m_decompressed_pos, to_copy);
m_decompressed_pos += to_copy;
// Set up the buffer pointers for streambuf:
// eback() = m_buffer.data() (start of buffer)
// gptr() = m_buffer.data() (current position)
// egptr() = m_buffer.data() + to_copy (end of valid data)
setg(m_buffer.data(), m_buffer.data(), m_buffer.data() + to_copy);
dbgTrace(D_WAAP_SERIALIZE) << "Serving " << to_copy << " bytes from existing decompressed buffer";
// Yield after serving data from buffer to allow other routines to run
YIELD_IF_POSSIBLE();
return true;
}
// Need to process the next chunk
if (!processNextChunk()) {
m_eof_reached = true;
return false;
}
// Now try again with the new data
return fillBuffer();
}
bool BufferedCompressedInputStream::DecompressedBuffer::processNextChunk()
{
while (true) {
if (m_stream_finished) {
return false;
}
// Read a chunk of encrypted data from the underlying stream
if (m_encrypted_buffer.size() < CHUNK_SIZE) {
m_encrypted_buffer.resize(CHUNK_SIZE);
}
m_underlying_stream.read(m_encrypted_buffer.data(), CHUNK_SIZE);
streamsize bytes_read = m_underlying_stream.gcount();
if (bytes_read <= 0) {
m_stream_finished = true;
// End of stream - no more data to process
dbgTrace(D_WAAP_SERIALIZE) << "Reached end of input stream";
return false;
}
m_encrypted_buffer.resize(bytes_read);
dbgTrace(D_WAAP_SERIALIZE) << "Read " << bytes_read << " encrypted bytes from stream";
// Decrypt the chunk
std::vector<char> decrypted_chunk;
if (!decryptChunk(m_encrypted_buffer, decrypted_chunk)) {
dbgWarning(D_WAAP_SERIALIZE) << "Failed to decrypt chunk";
break;
}
// Decompress the chunk
std::vector<char> decompressed_chunk;
if (!decompressChunk(decrypted_chunk, decompressed_chunk)) {
dbgWarning(D_WAAP_SERIALIZE) << "Failed to decompress chunk";
break;
}
if (decompressed_chunk.empty()) {
dbgTrace(D_WAAP_SERIALIZE) << "Decompressed chunk is empty, skipping";
continue; // Nothing to add to the buffer
}
// Replace the decompressed buffer with new data using swap to avoid unnecessary allocations
m_decompressed_buffer.swap(decompressed_chunk);
m_decompressed_pos = 0;
dbgTrace(D_WAAP_SERIALIZE) << "Processed chunk: " << bytes_read
<< " encrypted -> " << decrypted_chunk.size()
<< " compressed -> " << m_decompressed_buffer.size() << " decompressed";
// Yield after processing chunk to allow other routines to run
YIELD_IF_POSSIBLE();
return true;
}
return false;
}
bool BufferedCompressedInputStream::DecompressedBuffer::decryptChunk(
const std::vector<char> &encrypted_chunk,
std::vector<char> &decrypted_chunk)
{
// No encryption - just copy the data
decrypted_chunk = encrypted_chunk;
return true;
}
bool BufferedCompressedInputStream::DecompressedBuffer::decompressChunk(
const std::vector<char> &compressed_chunk,
std::vector<char> &decompressed_chunk)
{
if (compressed_chunk.empty()) {
return true; // Nothing to decompress
}
// Use the streaming decompression
DecompressionResult result = decompressData(
m_compression_stream,
compressed_chunk.size(),
reinterpret_cast<const unsigned char*>(compressed_chunk.data())
);
if (!result.ok) {
dbgWarning(D_WAAP_SERIALIZE) << "Failed to decompress chunk";
return false;
}
if (result.output && result.num_output_bytes > 0) {
decompressed_chunk.assign(
reinterpret_cast<const char*>(result.output),
reinterpret_cast<const char*>(result.output) + result.num_output_bytes
);
free(result.output);
dbgTrace(D_WAAP_SERIALIZE) << "Decompressed chunk: " << compressed_chunk.size()
<< " -> " << decompressed_chunk.size() << " bytes";
// Yield after decompression to allow other routines to run
YIELD_IF_POSSIBLE();
return true;
}
// No output data yet (might need more input for compression algorithm)
decompressed_chunk.clear();
return true;
}
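
The CompressionStream helpers used throughout this file follow a simple streaming contract: feed chunks through the stream and pass is_last on the final one so the compressor can emit its trailer. A single-chunk sketch of that contract follows, with the signatures inferred from the calls above (treat them as assumptions rather than the authoritative compression_utils API).

#include <cstdlib>
#include <string>
#include "compression_utils.h"

// Sketch only: compress one whole buffer in a single call, finalizing immediately.
std::string compressWholeBuffer(const std::string &plain)
{
    CompressionStream *stream = initCompressionStream();
    CompressionResult res = compressData(
        stream,
        CompressionType::GZIP,                                  // GZIP, as used above
        static_cast<uint32_t>(plain.size()),
        reinterpret_cast<const unsigned char *>(plain.data()),
        1                                                       // is_last: single, final chunk
    );
    std::string out;
    if (res.ok && res.output && res.num_output_bytes > 0) {
        out.assign(reinterpret_cast<const char *>(res.output), res.num_output_bytes);
        free(res.output);  // the utils hand back a malloc'ed buffer, freed by the caller
    }
    finiCompressionStream(stream);
    return out;
}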