diff --git a/src/operators/validate_utf8_encoding.cc b/src/operators/validate_utf8_encoding.cc
index f55e231f..f6781b81 100644
--- a/src/operators/validate_utf8_encoding.cc
+++ b/src/operators/validate_utf8_encoding.cc
@@ -22,21 +22,184 @@ namespace ModSecurity {
 namespace operators {
 
 
-bool ValidateUtf8Encoding::evaluate(Assay *assay) {
-    /**
-     * @todo Implement the operator ValidateUtf8Encoding.
-     *       Reference: https://github.com/SpiderLabs/ModSecurity/wiki/Reference-Manual#validateUtf8Encoding
-     */
-    return true;
+/**
+ * Validates the single UTF-8 encoded character that starts at p_read.
+ *
+ * @param p_read pointer to the lead byte; it is dereferenced unconditionally.
+ * @param length number of bytes available starting at p_read.
+ * @return the size in bytes (1 to 4) of a well-formed character, or one of
+ *         the negative UNICODE_ERROR_* codes when the sequence is rejected.
+ */
+int ValidateUtf8Encoding::detect_utf8_character(
+    const unsigned char *p_read, unsigned int length) {
+    int unicode_len = 0;
+    unsigned int d = 0;
+    unsigned char c;
+
+    if (p_read == NULL) {
+        return UNICODE_ERROR_DECODING_ERROR;
+    }
+    c = *p_read;
+
+    /* If first byte begins with binary 0 it is single byte encoding */
+    if ((c & 0x80) == 0) {
+        /* single byte unicode (7 bit ASCII equivalent) has no validation */
+        return 1;
+    } else if ((c & 0xE0) == 0xC0) {
+        /* If first byte begins with binary 110 it is two byte encoding*/
+        /* check we have at least two bytes */
+        if (length < 2) {
+            unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+        } else if (((*(p_read + 1)) & 0xC0) != 0x80) {
+            /* check second byte starts with binary 10 */
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else {
+            unicode_len = 2;
+            /* compute character number */
+            d = ((c & 0x1F) << 6) | (*(p_read + 1) & 0x3F);
+        }
+    } else if ((c & 0xF0) == 0xE0) {
+        /* If first byte begins with binary 1110 it is three byte encoding */
+        /* check we have at least three bytes */
+        if (length < 3) {
+            unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+        } else if (((*(p_read + 1)) & 0xC0) != 0x80) {
+            /* check second byte starts with binary 10 */
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else if (((*(p_read + 2)) & 0xC0) != 0x80) {
+            /* check third byte starts with binary 10 */
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else {
+            unicode_len = 3;
+            /* compute character number */
+            d = ((c & 0x0F) << 12) | ((*(p_read + 1) & 0x3F) << 6)
+                | (*(p_read + 2) & 0x3F);
+        }
+    } else if ((c & 0xF8) == 0xF0) {
+        /* If first byte begins with binary 11110 it is four byte encoding */
+        /* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/
+        if (c >= 0xF5) {
+            return UNICODE_ERROR_RESTRICTED_CHARACTER;
+        }
+        /* check we have at least four bytes */
+        if (length < 4) {
+            unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+        } else if (((*(p_read + 1)) & 0xC0) != 0x80) {
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else if (((*(p_read + 2)) & 0xC0) != 0x80) {
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else if (((*(p_read + 3)) & 0xC0) != 0x80) {
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else {
+            unicode_len = 4;
+            /* compute character number */
+            d = ((c & 0x07) << 18) | ((*(p_read + 1) & 0x3F) << 12)
+                | ((*(p_read + 2) & 0x3F) << 6) | (*(p_read + 3) & 0x3F);
+        }
+    } else {
+        /* any other first byte is invalid (RFC 3629) */
+        return UNICODE_ERROR_INVALID_ENCODING;
+    }
+
+    /* invalid UTF-8 character number range (RFC 3629) */
+    if (((d >= 0xD800) && (d <= 0xDFFF)) || (d > 0x10FFFF)) {
+        return UNICODE_ERROR_RESTRICTED_CHARACTER;
+    }
+
+    /* check for overlong */
+    if ((unicode_len == 4) && (d < 0x010000)) {
+        /* four byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    } else if ((unicode_len == 3) && (d < 0x0800)) {
+        /* three byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    } else if ((unicode_len == 2) && (d < 0x80)) {
+        /* two byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    }
+
+    return unicode_len;
 }
 
+/**
+ * Scans str character by character looking for invalid UTF-8.
+ *
+ * @return true as soon as an invalid sequence is found (the reason is sent
+ *         to assay->debug() at level 8 when assay is not null); false when
+ *         every character in str is well-formed.
+ */
+bool ValidateUtf8Encoding::evaluate(Assay *assay, const std::string &str) {
+    unsigned int i, bytes_left;
 
-ValidateUtf8Encoding::ValidateUtf8Encoding(std::string op, std::string param,
-    bool negation)
-    : Operator() {
-    this->op = op;
-    this->param = param;
+    const char *str_c = str.c_str();
+    bytes_left = str.size();
+
+    for (i = 0; i < str.size();) {
+        int rc = detect_utf8_character((unsigned char *)&str_c[i], bytes_left);
+
+        switch (rc) {
+            case UNICODE_ERROR_CHARACTERS_MISSING :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "not enough bytes in character "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_INVALID_ENCODING :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "invalid byte value in character "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_OVERLONG_CHARACTER :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "overlong character detected "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_RESTRICTED_CHARACTER :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "use of restricted character "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_DECODING_ERROR :
+                if (assay) {
+                    assay->debug(8, "Error validating UTF-8 decoding "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+        }
+
+        if (rc <= 0) {
+            if (assay) {
+                assay->debug(8, "Internal error during UTF-8 validation "
+                    "at " + str + ". [offset \"" +
+                    std::to_string(i) + "\"]");
+            }
+            return true;
+        }
+
+        i += rc;
+        bytes_left -= rc;
+    }
+
+    return false;
 }
 
+
 }  // namespace operators
 }  // namespace ModSecurity
diff --git a/src/operators/validate_utf8_encoding.h b/src/operators/validate_utf8_encoding.h
index 924f98c5..95019a88 100644
--- a/src/operators/validate_utf8_encoding.h
+++ b/src/operators/validate_utf8_encoding.h
@@ -20,7 +20,13 @@
 
 #include "operators/operator.h"
 
-#ifdef __cplusplus
+
+#define UNICODE_ERROR_CHARACTERS_MISSING -1
+#define UNICODE_ERROR_INVALID_ENCODING -2
+#define UNICODE_ERROR_OVERLONG_CHARACTER -3
+#define UNICODE_ERROR_RESTRICTED_CHARACTER -4
+#define UNICODE_ERROR_DECODING_ERROR -5
+
 namespace ModSecurity {
 namespace operators {
 
@@ -28,14 +34,18 @@ namespace operators {
 class ValidateUtf8Encoding : public Operator {
  public:
     /** @ingroup ModSecurity_Operator */
-    ValidateUtf8Encoding(std::string o, std::string p, bool i);
-    bool evaluate(Assay *assay);
+    ValidateUtf8Encoding(std::string op, std::string param, bool negation)
+        : Operator(op, param, negation) { }
+
+    bool evaluate(Assay *assay, const std::string &input) override;
+
+    int detect_utf8_character(const unsigned char *p_read,
+        unsigned int length);
 };
 
 }  // namespace operators
 }  // namespace ModSecurity
 
-#endif
 
 
 #endif  // SRC_OPERATORS_VALIDATE_UTF8_ENCODING_H_
diff --git a/test/unit/unit_test.cc b/test/unit/unit_test.cc
index b9bcd294..011aca5c 100644
--- a/test/unit/unit_test.cc
+++ b/test/unit/unit_test.cc
@@ -94,6 +94,12 @@ UnitTest *UnitTest::from_yajl_node(yajl_val &node) {
         } else if (strcmp(key, "input") == 0) {
             u->input = YAJL_GET_STRING(val);
             replaceAll(&(u->input), "\\0", '\0');
+            replaceAll(&(u->input), "\\xe4", '\xe4');
+            replaceAll(&(u->input), "\\x03", '\x03');
+            replaceAll(&(u->input), "\\xbf", '\xbf');
+            replaceAll(&(u->input), "\\xc9", '\xc9');
+            replaceAll(&(u->input), "\\x3b", '\x3b');
+            replaceAll(&(u->input), "\\xFF", '\xff');
         } else if (strcmp(key, "name") == 0) {
             u->name = YAJL_GET_STRING(val);
         } else if (strcmp(key, "type") == 0) {