2023-12-04 14:16:00 +02:00

86 lines
2.9 KiB
C++
Executable File

// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "ParserBase.h"
#include "KeyStack.h"
#include <libxml/xmlstring.h>
#include <libxml/xmlerror.h>
#include <libxml/parser.h>
#include <libxml/HTMLparser.h>
// Capacity of the buffer that holds the first bytes of the stream (see m_buf in ParserHTML).
// At least the first 4 bytes must be buffered to allow Unicode autodetection (BOM).
#define FIRST_BUFFER_SIZE 5
// HTML parser built around a libxml2 HTML push-parser context and SAX handler
// (see m_pushParserCtxPtr and m_saxHandler below). Input is supplied in chunks
// through push().
class ParserHTML : public ParserBase {
public:
    // receiver     - stream receiver; presumably bound to m_receiver (confirm in the .cpp).
    // parser_depth - depth value; presumably stored in m_parser_depth (confirm in the .cpp).
    ParserHTML(IParserStreamReceiver &receiver, size_t parser_depth);
    virtual ~ParserHTML();

    // Hands the next chunk of input (data_len bytes starting at data) to the parser.
    // NOTE(review): the meaning of the returned size_t is defined by the implementation,
    // which is not visible from this header.
    size_t push(const char *data, size_t data_len);

    // NOTE(review): presumably signals end of input - confirm against the implementation.
    void finish();

    // Returns the parser's name (presumably m_parserName - confirm in the .cpp).
    virtual const std::string &name() const;

    // Reports whether the parser is in an error condition
    // (presumably m_state == s_error - confirm in the .cpp).
    bool error() const;

    // Reported depth: one less than the key stack depth while that depth is positive,
    // otherwise the key stack depth unchanged.
    virtual size_t depth() {
        if (m_key.depth() > 0) {
            return m_key.depth() - 1;
        }
        return m_key.depth();
    }

private:
    // States of the parser (tracked in m_state).
    enum state {
        s_start,
        s_accumulate_first_bytes,
        s_start_parsing,
        s_parsing,
        s_error
    };

    // Per-element bookkeeping for every entry on the stack of tracked HTML elements.
    struct ElemTrackInfo {
        // Text collected for the element. It starts out empty and grows each time the
        // HTML parser delivers another piece of text; the accumulated value is emitted
        // when the element finishes.
        std::string value;

        // Becomes true once the opening of a sub-element is seen.
        bool hasChildren;

        // A freshly opened element has not seen any children yet, so the flag starts as false.
        ElemTrackInfo() : hasChildren(false) {}
    };

    // Static callbacks whose signatures match libxml2's SAX startElement / endElement /
    // characters handlers. ctx is the opaque user pointer passed back by libxml2
    // (presumably the ParserHTML instance - confirm in the .cpp).
    static void onStartElement(
        void *ctx,
        const xmlChar *localname,
        const xmlChar **attributes);

    static void onEndElement(
        void *ctx,
        const xmlChar *localname);

    static void onCharacters(
        void *ctx,
        const xmlChar *ch,
        int len);

    // Decides whether a libxml2 error matters.
    // Returns true when the error should be treated, false when it should be ignored.
    bool filterErrors(const xmlError *xmlError);

    IParserStreamReceiver &m_receiver;
    enum state m_state;

    // Holds the first few bytes of the stream; they are required before the SAX parser
    // can be called for the first time.
    char m_buf[FIRST_BUFFER_SIZE];
    int m_bufLen;

    KeyStack m_key;
    std::vector<ElemTrackInfo> m_elemTrackStack;
    htmlSAXHandler m_saxHandler;
    htmlParserCtxtPtr m_pushParserCtxPtr;
    static const std::string m_parserName;
    size_t m_parser_depth;
};