2023-12-04 14:16:00 +02:00

369 lines
13 KiB
C++
Executable File

// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "ParserHTML.h"
#include "Waf2Util.h"
#include "debug.h"
#include <assert.h>
// Debug flag referenced by every dbgTrace/dbgDebug/dbgWarning statement in this file.
USE_DEBUG_FLAG(D_WAAP_PARSER_HTML);
// Parser name reported by ParserHTML::name().
const std::string ParserHTML::m_parserName = "ParserHTML";
void ParserHTML::onStartElement(
void* ctx,
const xmlChar* localname,
const xmlChar** attributes)
{
ParserHTML* p = (ParserHTML*)ctx;
dbgTrace(D_WAAP_PARSER_HTML) << "HTML OPEN: '" << localname << "'";
p->m_key.push((const char*)localname, xmlStrlen(localname));
if (attributes != NULL) {
int i;
for (i = 0; attributes[i*2]; i++) {
const xmlChar* attr_localname = attributes[i * 2 + 0];
const xmlChar* attr_value = attributes[i * 2 + 1];
if (attr_value == NULL) {
attr_value = (const xmlChar*)"";
}
dbgTrace(D_WAAP_PARSER_HTML)
<< "\tHTML ATTR: elem='"
<< (char *)localname
<< "', "
<< attr_localname
<< "='"
<< std::string((char *)attr_value)
<< "'";
p->m_key.push((const char *)attr_localname, xmlStrlen(attr_localname));
if (p->m_receiver.onKv(
p->m_key.first().c_str(),
p->m_key.first().size(),
(const char *)attr_value,
strlen((const char *)attr_value),
BUFFERED_RECEIVER_F_BOTH,
p->m_parser_depth
) != 0) {
p->m_state = s_error;
}
p->m_key.pop("HTML end attribute");
}
}
// before we add new tracking element to the stack for this new element,
// set "children exists" flag to true for the parent element.
if (!p->m_elemTrackStack.empty()) {
p->m_elemTrackStack.back().hasChildren = true;
}
// when opening new element - start tracking its properties (internal text and existence of subelements)
p->m_elemTrackStack.push_back(ElemTrackInfo());
}
void
ParserHTML::onEndElement(
void* ctx,
const xmlChar* localname)
{
ParserHTML* p = (ParserHTML*)ctx;
dbgTrace(D_WAAP_PARSER_HTML) << "HTML CLOSE: '" << localname << "'";
if (p->m_elemTrackStack.empty()) {
dbgWarning(D_WAAP_PARSER_HTML)
<< "HTML closing tag and elem track stack is empty. This is probably sign of a bug!";
return;
}
ElemTrackInfo& elemTrackInfo = p->m_elemTrackStack.back();
// Usability optimization: only output kv pair for HTML elements that had either sub children
// and/or value within.
// Those "wrapper elements" such as <wrapper><name>john</name><age>21</age></wrapper> only
// contain sub elements. For these we don't emit kv pair.
// However, for truly empty element such as <wrapper></wrapper>, or similar element with
// text: <wrapper>some text</wrapper>, we do output a kv pair.
bool isWrapperElement = elemTrackInfo.hasChildren && (elemTrackInfo.value.size() == 0);
if (!isWrapperElement) {
// Emit tag name as key
if (p->m_receiver.onKey(p->m_key.first().c_str(), p->m_key.first().size()) != 0) {
p->m_state = s_error;
}
if (p->m_receiver.onValue(elemTrackInfo.value.c_str(), elemTrackInfo.value.size()) != 0) {
p->m_state = s_error;
}
if (p->m_receiver.onKvDone() != 0) {
p->m_state = s_error; // error
}
}
// when closing an element - pop its tracking info from the tracking stack
p->m_elemTrackStack.pop_back();
// Also, pop the element's name from m_key stack, so the key name always reflects
// current depth within the elements tree
p->m_key.pop("HTML end element");
}
void
ParserHTML::onCharacters(void *ctx, const xmlChar *ch, int len)
{
ParserHTML *p = (ParserHTML *)ctx;
if (p->m_elemTrackStack.empty()) {
dbgWarning(D_WAAP_PARSER_HTML) << "HTML text and elem track stack is empty. This is probably sign of a bug!";
return;
}
if ((ch == NULL) || (len == 0)) {
dbgTrace(D_WAAP_PARSER_HTML) << "Got empty HTML text element. Ignoring.";
return;
}
ElemTrackInfo& elemTrackInfo = p->m_elemTrackStack.back();
dbgTrace(D_WAAP_PARSER_HTML) << "HTML TEXT: '[" << std::string((char*)ch, (size_t)len) << "]'";
std::string val = std::string((char*)ch, (size_t)len);
// trim isspace() characters around html text chunks.
// The chunks can occur multiple times within one value, when text value is intermixed with html sub-tags.
// for example, for HTML source "<a>sta<b>zzz</b>rt</a>", the "a" tag will include two text
// chunks "sta" and "rt"
// which are concatenated here to form the word "start".
// The trimming is done here to prevent false alarms on detection algorithm that sees
// "\n" characters in the HTML value.
// Example of input that causes false alarm without this trim is (multiline HTML):
// <html><script>\nclean_html_value '\n<\/script><\/html>
Waap::Util::trim(val);
elemTrackInfo.value += val;
}
// SAX error callback (registered in the constructor as m_saxHandler.error).
// Formats the printf-style message into a fixed-size local buffer (longer messages are
// truncated by vsnprintf) and writes it to the debug trace. ctx is not used.
static void
onError(void *ctx, const char *msg, ...)
{
    char formatted[4096];

    va_list args;
    va_start(args, msg);
    vsnprintf(formatted, sizeof(formatted), msg, args);
    va_end(args);

    dbgTrace(D_WAAP_PARSER_HTML) << "LIBXML (html) onError: " << std::string(formatted);
}
// Builds a parser that reports key/value pairs to `receiver`.
// `parser_depth` is stored and handed to the receiver with every attribute (see onStartElement).
// The libxml2 parser context itself is not created here; push() creates it once the first
// bytes of the stream were buffered.
ParserHTML::ParserHTML(IParserStreamReceiver &receiver, size_t parser_depth) :
    m_receiver(receiver),
    m_state(s_start),
    m_bufLen(0),
    m_key("html_parser"),
    m_pushParserCtxPtr(NULL),
    m_parser_depth(parser_depth)
{
    dbgTrace(D_WAAP_PARSER_HTML)
        << "ParserHTML::ParserHTML()"
        << "parser_depth="
        << parser_depth;

    // TODO:: is zeroing this really needed?
    memset(m_buf, 0, sizeof(m_buf));

    // Custom SAX handler: start from an all-zero table and hook only the callbacks
    // implemented in this file.
    memset(&m_saxHandler, 0, sizeof(m_saxHandler));
    m_saxHandler.error = onError;
    m_saxHandler.characters = onCharacters;
    m_saxHandler.endElement = onEndElement;
    m_saxHandler.startElement = onStartElement;

    // Ugly: seed the key with a first element (it will be ignored since the "first()" method
    // result never includes it within the HTML parser object).
    m_key.push("html", 4);

    // Register a "dummy" tracking entry so text can be received before any element is opened.
    m_elemTrackStack.push_back(ElemTrackInfo());
}
// Releases the libxml2 push parser context, if push() ever created one.
ParserHTML::~ParserHTML()
{
    dbgTrace(D_WAAP_PARSER_HTML) << "ParserHTML::~ParserHTML()";

    if (m_pushParserCtxPtr == NULL) {
        // The context is created lazily by push(); nothing to clean up.
        return;
    }
    htmlFreeParserCtxt(m_pushParserCtxPtr);
}
// Decides whether an error reported by libxml2 should be treated as a real parsing failure.
//
// Returns false for the error codes that are deliberately ignored
// (XML_ERR_RESERVED_XML_NAME and XML_ERR_UNDECLARED_ENTITY) and true for everything else.
//
// The parser runs with HTML_PARSE_RECOVER (see push()), so it keeps parsing after an error
// but still reports it. Ignoring these codes keeps the WAAP code from considering the
// document "broken" and scanning its source as-is, which prevents false alarms on such input.
bool
ParserHTML::filterErrors(const xmlError *xmlError)
{
    dbgDebug(D_WAAP_PARSER_HTML)
        << "ParserHTML::filterErrors(): xmlError "
        << xmlError->code
        << ": '"
        << xmlError->message
        << "'";

    switch (xmlError->code) {
        case XML_ERR_RESERVED_XML_NAME:
        case XML_ERR_UNDECLARED_ENTITY:
            dbgDebug(D_WAAP_PARSER_HTML)
                << "ParserHTML::filterErrors(): ignoring the '"
                << xmlError->code
                << ": "
                << xmlError->message
                << "' html parser error.";
            return false;
        default:
            return true;
    }
}
// Feeds one chunk of the input stream to the HTML parser.
//
// data, data_len - the chunk; data_len == 0 signals end of stream (see finish()).
//
// The first bytes of the stream are collected in m_buf and handed to libxml2 in one piece when
// the push parser context is created, so libxml2 can determine the text encoding. Normally
// FIRST_BUFFER_SIZE - 1 bytes are collected; when a '?' is among them, FIRST_BUFFER_SIZE bytes
// are collected. Everything after that is passed straight to htmlParseChunk().
//
// Returns:
//  - for a data chunk: the number of bytes consumed (data_len), or 0 if the parser is, or has
//    just entered, the error state;
//  - for the end-of-stream signal: m_bufLen, or (size_t)-1 if libxml2 reported an error that
//    filterErrors() does not ignore.
// A failure is also recorded in m_state (see error()).
size_t
ParserHTML::push(const char *data, size_t data_len)
{
    size_t i = 0;
    char c;

    if (data_len == 0) {
        dbgTrace(D_WAAP_PARSER_HTML) << "ParserHTML::push(): end of data signal! m_state=" << m_state;

        // The parser context only exists once enough leading bytes were buffered. For a shorter
        // stream there is nothing to terminate, so do not hand a NULL context to libxml2.
        if (m_pushParserCtxPtr == NULL) {
            return m_bufLen;
        }

        // Send zero-length chunk with "terminate" flag enabled to signify end-of-stream
        if (htmlParseChunk(m_pushParserCtxPtr, m_buf, 0, 1)) {
            auto xmlError = xmlCtxtGetLastError(m_pushParserCtxPtr);
            if (xmlError && filterErrors(xmlError)) {
                dbgDebug(D_WAAP_PARSER_HTML)
                    << "ParserHTML::push(): xmlError: code="
                    << xmlError->code
                    << ": '"
                    << xmlError->message
                    << "'";
                m_state = s_error; // error
                // Historical return value of this path (the original code returned -1 from a
                // function returning size_t); kept as is for existing callers.
                return static_cast<size_t>(-1);
            }
        }
        return m_bufLen;
    }

    int expected_buffer_len = FIRST_BUFFER_SIZE - 1;
    // The input may arrive split over several push() calls. Re-derive the "a '?' was seen"
    // decision from the bytes buffered so far, otherwise it would be lost between calls.
    if (m_bufLen > 0 && memchr(m_buf, '?', m_bufLen) != NULL) {
        expected_buffer_len = FIRST_BUFFER_SIZE;
    }

    while (i < data_len) {
        c = data[i];

        switch (m_state) {
            case s_start:
                dbgTrace(D_WAAP_PARSER_HTML) << "ParserHTML::push(): s_start";
                m_state = s_accumulate_first_bytes;

                // fall through //
                CP_FALL_THROUGH;
            case s_accumulate_first_bytes:
                dbgTrace(D_WAAP_PARSER_HTML)
                    << "ParserHTML::push(): s_accumulate_first_bytes. c='"
                    << data[i]
                    << "'; m_bufLen="
                    << m_bufLen
                    << "; i="
                    << i;
                m_buf[m_bufLen] = c;
                m_bufLen++;
                if (c == '?') {
                    expected_buffer_len = FIRST_BUFFER_SIZE;
                }
                // ">=" rather than "==": the threshold can change while bytes are being
                // collected, and stepping over it must never let m_bufLen run past m_buf.
                if (m_bufLen >= expected_buffer_len) {
                    m_state = s_start_parsing;
                }
                break;

            case s_start_parsing:
                dbgTrace(D_WAAP_PARSER_HTML)
                    << "ParserHTML::push(): s_start_parsing. sending len="
                    << m_bufLen
                    << ": '"
                    << std::string(m_buf, m_bufLen)
                    << "'; i="
                    << i;
                // Create HTML SAX (push parser) context
                // It is important to buffer at least first 4 bytes of input stream so libxml can determine text
                // encoding!
                m_pushParserCtxPtr =
                    htmlCreatePushParserCtxt(&m_saxHandler, this, m_buf, m_bufLen, NULL, XML_CHAR_ENCODING_UTF8);

                // Enable "permissive mode" for HTML SAX parser.
                // In this mode, the libxml parser doesn't stop on errors, but still reports them!
                // NOTE(review): a failed context creation is not reported; the s_parsing state
                // below then skips the input without entering s_error - confirm this is intended.
                htmlCtxtUseOptions(m_pushParserCtxPtr, HTML_PARSE_RECOVER);
                m_state = s_parsing;

                // fall through //
                CP_FALL_THROUGH;
            case s_parsing:
                dbgTrace(D_WAAP_PARSER_HTML)
                    << "ParserHTML::push(): s_parsing. sending len="
                    << (int)(data_len - i)
                    << ": '"
                    << std::string(data + i, data_len - i)
                    << "'; i="
                    << i;
                if (m_pushParserCtxPtr) {
                    if (htmlParseChunk(m_pushParserCtxPtr, data + i, data_len - i, 0)) {
                        auto xmlError = xmlCtxtGetLastError(m_pushParserCtxPtr);
                        if (xmlError && filterErrors(xmlError)) {
                            dbgDebug(D_WAAP_PARSER_HTML)
                                << "ParserHTML::push(): xmlError: code="
                                << xmlError->code
                                << ": '"
                                << xmlError->message
                                << "'";
                            m_state = s_error; // error
                            return 0;
                        }
                    }

                    // success (whole buffer consumed)
                    i = data_len - 1; // take into account ++i at the end of the state machine loop
                }
                break;

            case s_error:
                dbgTrace(D_WAAP_PARSER_HTML) << "ParserHTML::push(): s_error";
                return 0;
        }

        ++i;
    }

    dbgTrace(D_WAAP_PARSER_HTML) << "ParserHTML::push(): exiting with param(len)=" << data_len << ": i=" << i;
    return i;
}
// Signals end of stream: push() treats a zero-length chunk as the terminating one.
// The value returned by push() is ignored here; a failure is visible through error().
void
ParserHTML::finish()
{
    const char *noData = NULL;
    push(noData, 0);
}
// Returns the parser's name (the class-wide m_parserName constant).
const std::string &
ParserHTML::name() const
{
    return ParserHTML::m_parserName;
}
// Returns true once the parser has entered the s_error state.
bool
ParserHTML::error() const
{
    const bool failed = (m_state == s_error);
    return failed;
}