// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved. // Licensed under the Apache License, Version 2.0 (the "License"); // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include "ParserBase.h" #include "KeyStack.h" #include #include #include #include #define FIRST_BUFFER_SIZE 5 // must buffer at least 4 first bytes to allow unicode autodetection (BOM). class ParserHTML : public ParserBase { public: ParserHTML(IParserStreamReceiver &receiver); virtual ~ParserHTML(); size_t push(const char *data, size_t data_len); void finish(); virtual const std::string &name() const; bool error() const; virtual size_t depth() { return (m_key.depth() > 0) ? m_key.depth()-1 : m_key.depth(); } private: enum state { s_start, s_accumulate_first_bytes, s_start_parsing, s_parsing, s_error }; // Information tracked per each element in current stack of tracked HTML elements struct ElemTrackInfo { std::string value; bool hasChildren; ElemTrackInfo():hasChildren(false) { // when element is just opened - we still didn't see any children, // hence start with the "hasChildren" flag as false. // This flag will be enabled once we meet opening of the a subelement. // Also, we start from empty value string and gradually append to it each // time we receive next piece of text from HTML parser. // The collected value is then emitted when element finishes. } }; static void onStartElement( void *ctx, const xmlChar *localname, const xmlChar **attributes); static void onEndElement( void* ctx, const xmlChar* localname); static void onCharacters( void *ctx, const xmlChar *ch, int len); // Filter out errors that should be ignored. Returns true if error should be treated, // false if an error should be ignored bool filterErrors(xmlErrorPtr xmlError); IParserStreamReceiver &m_receiver; enum state m_state; // buffer first few bytes of stream (required before calling SAX parser for the first time) char m_buf[FIRST_BUFFER_SIZE]; int m_bufLen; KeyStack m_key; std::vector m_elemTrackStack; htmlSAXHandler m_saxHandler; htmlParserCtxtPtr m_pushParserCtxPtr; static const std::string m_parserName; };