// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved. // Licensed under the Apache License, Version 2.0 (the "License"); // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "ParserHTML.h" #include "Waf2Util.h" #include "debug.h" #include USE_DEBUG_FLAG(D_WAAP_PARSER_HTML); const std::string ParserHTML::m_parserName = "ParserHTML"; void ParserHTML::onStartElement( void* ctx, const xmlChar* localname, const xmlChar** attributes) { ParserHTML* p = (ParserHTML*)ctx; dbgTrace(D_WAAP_PARSER_HTML) << "HTML OPEN: '" << localname << "'"; p->m_key.push((const char*)localname, xmlStrlen(localname)); if (attributes != NULL) { int i; for (i = 0; attributes[i*2]; i++) { const xmlChar* attr_localname = attributes[i * 2 + 0]; const xmlChar* attr_value = attributes[i * 2 + 1]; if (attr_value == NULL) { attr_value = (const xmlChar*)""; } dbgTrace(D_WAAP_PARSER_HTML) << "\tHTML ATTR: elem='" << (char *)localname << "', " << attr_localname << "='" << std::string((char *)attr_value) << "'"; p->m_key.push((const char *)attr_localname, xmlStrlen(attr_localname)); if (p->m_receiver.onKv( p->m_key.first().c_str(), p->m_key.first().size(), (const char *)attr_value, strlen((const char *)attr_value), BUFFERED_RECEIVER_F_BOTH, p->m_parser_depth ) != 0) { p->m_state = s_error; } p->m_key.pop("HTML end attribute"); } } // before we add new tracking element to the stack for this new element, // set "children exists" flag to true for the parent element. if (!p->m_elemTrackStack.empty()) { p->m_elemTrackStack.back().hasChildren = true; } // when opening new element - start tracking its properties (internal text and existence of subelements) p->m_elemTrackStack.push_back(ElemTrackInfo()); } void ParserHTML::onEndElement( void* ctx, const xmlChar* localname) { ParserHTML* p = (ParserHTML*)ctx; dbgTrace(D_WAAP_PARSER_HTML) << "HTML CLOSE: '" << localname << "'"; if (p->m_elemTrackStack.empty()) { dbgWarning(D_WAAP_PARSER_HTML) << "HTML closing tag and elem track stack is empty. This is probably sign of a bug!"; return; } ElemTrackInfo& elemTrackInfo = p->m_elemTrackStack.back(); // Usability optimization: only output kv pair for HTML elements that had either sub children // and/or value within. // Those "wrapper elements" such as john21 only // contain sub elements. For these we don't emit kv pair. // However, for truly empty element such as , or similar element with // text: some text, we do output a kv pair. bool isWrapperElement = elemTrackInfo.hasChildren && (elemTrackInfo.value.size() == 0); if (!isWrapperElement) { // Emit tag name as key if (p->m_receiver.onKey(p->m_key.first().c_str(), p->m_key.first().size()) != 0) { p->m_state = s_error; } if (p->m_receiver.onValue(elemTrackInfo.value.c_str(), elemTrackInfo.value.size()) != 0) { p->m_state = s_error; } if (p->m_receiver.onKvDone() != 0) { p->m_state = s_error; // error } } // when closing an element - pop its tracking info from the tracking stack p->m_elemTrackStack.pop_back(); // Also, pop the element's name from m_key stack, so the key name always reflects // current depth within the elements tree p->m_key.pop("HTML end element"); } void ParserHTML::onCharacters(void *ctx, const xmlChar *ch, int len) { ParserHTML *p = (ParserHTML *)ctx; if (p->m_elemTrackStack.empty()) { dbgWarning(D_WAAP_PARSER_HTML) << "HTML text and elem track stack is empty. This is probably sign of a bug!"; return; } if ((ch == NULL) || (len == 0)) { dbgTrace(D_WAAP_PARSER_HTML) << "Got empty HTML text element. Ignoring."; return; } ElemTrackInfo& elemTrackInfo = p->m_elemTrackStack.back(); dbgTrace(D_WAAP_PARSER_HTML) << "HTML TEXT: '[" << std::string((char*)ch, (size_t)len) << "]'"; std::string val = std::string((char*)ch, (size_t)len); // trim isspace() characters around html text chunks. // The chunks can occur multiple times within one value, when text value is intermixed with html sub-tags. // for example, for HTML source "stazzzrt", the "a" tag will include two text // chunks "sta" and "rt" // which are concatenated here to form the word "start". // The trimming is done here to prevent false alarms on detection algorithm that sees // "\n" characters in the HTML value. // Example of input that causes false alarm without this trim is (multiline HTML): //