Adds support for the ValidateUtf8Encoding operator

2025-11-22 03:56:05 +03:00 · 2015-08-10 14:51:27 -03:00
parent 9096055ea7
commit f231df16ad
3 changed files with 179 additions and 15 deletions
--- a/src/operators/validate_utf8_encoding.cc
+++ b/src/operators/validate_utf8_encoding.cc
@@ -22,21 +22,169 @@
 namespace ModSecurity {
 namespace operators {

-bool ValidateUtf8Encoding::evaluate(Assay *assay) {
-    /**
-     * @todo Implement the operator ValidateUtf8Encoding.
-     *       Reference: https://github.com/SpiderLabs/ModSecurity/wiki/Reference-Manual#validateUtf8Encoding
-     */
-    return true;
+/**
+ * Decodes the UTF-8 sequence starting at p_read and validates it (RFC 3629).
+ *
+ * @param p_read  first byte of the sequence; must not be NULL.
+ * @param length  number of readable bytes starting at p_read.
+ * @return the sequence length in bytes (1-4), or a negative UNICODE_ERROR_*.
+ */
+int ValidateUtf8Encoding::detect_utf8_character(
+    const unsigned char *p_read, unsigned int length) {
+    int unicode_len = 0;
+    unsigned int d = 0;  /* decoded code point */
+
+    if ((p_read == nullptr) || (length == 0)) {
+        return UNICODE_ERROR_DECODING_ERROR;
+    }
+    const unsigned char c = *p_read;
+
+    /* Lead byte 0xxxxxxx: single byte (7 bit ASCII), nothing to validate */
+    if ((c & 0x80) == 0) {
+        return 1;
+    } else if ((c & 0xE0) == 0xC0) {
+        /* Lead byte 110xxxxx: two byte encoding, one continuation byte */
+        if (length < 2) {
+            unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+        } else if (((*(p_read + 1)) & 0xC0) != 0x80) {
+            /* check second byte starts with binary 10 */
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else {
+            unicode_len = 2;
+            d = ((c & 0x1F) << 6) | (*(p_read + 1) & 0x3F);
+        }
+    } else if ((c & 0xF0) == 0xE0) {
+        /* Lead byte 1110xxxx: three byte encoding, two continuation bytes */
+        if (length < 3) {
+            unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+        } else if (((*(p_read + 1)) & 0xC0) != 0x80) {
+            /* check second byte starts with binary 10 */
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else if (((*(p_read + 2)) & 0xC0) != 0x80) {
+            /* check third byte starts with binary 10 */
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else {
+            unicode_len = 3;
+            d = ((c & 0x0F) << 12) | ((*(p_read + 1) & 0x3F) << 6)
+                | (*(p_read + 2) & 0x3F);
+        }
+    } else if ((c & 0xF8) == 0xF0) {
+        /* Lead byte 11110xxx: four byte encoding, three continuation bytes */
+        /* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/
+        if (c >= 0xF5) {
+            return UNICODE_ERROR_RESTRICTED_CHARACTER;
+        }
+        if (length < 4) {
+            unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+        } else if (((*(p_read + 1)) & 0xC0) != 0x80) {
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else if (((*(p_read + 2)) & 0xC0) != 0x80) {
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else if (((*(p_read + 3)) & 0xC0) != 0x80) {
+            unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        } else {
+            unicode_len = 4;
+            /* each continuation byte carries 6 payload bits */
+            d = ((c & 0x07) << 18) | ((*(p_read + 1) & 0x3F) << 12)
+                | ((*(p_read + 2) & 0x3F) << 6) | (*(p_read + 3) & 0x3F);
+        }
+    } else {
+        /* any other first byte is invalid (RFC 3629) */
+        return UNICODE_ERROR_INVALID_ENCODING;
+    }
+
+    /* surrogates and values above U+10FFFF are not characters (RFC 3629) */
+    if (((d >= 0xD800) && (d <= 0xDFFF)) || (d > 0x10FFFF)) {
+        return UNICODE_ERROR_RESTRICTED_CHARACTER;
+    }
+
+    /* check for overlong */
+    if ((unicode_len == 4) && (d < 0x010000)) {
+        /* four byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    } else if ((unicode_len == 3) && (d < 0x0800)) {
+        /* three byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    } else if ((unicode_len == 2) && (d < 0x80)) {
+        /* two byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    }
+
+    return unicode_len;
 }

+bool ValidateUtf8Encoding::evaluate(Assay *assay, const std::string &str) {
+    unsigned int i, bytes_left;  /* byte offset into str; bytes not yet read */

-ValidateUtf8Encoding::ValidateUtf8Encoding(std::string op, std::string param,
-    bool negation)
-    : Operator() {
-    this->op = op;
-    this->param = param;
+    const char *str_c = str.c_str();
+    bytes_left = str.size();
+    /* Scan str one UTF-8 sequence at a time: true = invalid, false = clean */
+    for (i = 0; i < str.size();) {
+        int rc = detect_utf8_character((unsigned char *)&str_c[i], bytes_left);
+        /* rc > 0 is the sequence length; negative is a UNICODE_ERROR_* code */
+        switch (rc) {
+            case UNICODE_ERROR_CHARACTERS_MISSING :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "not enough bytes in character "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_INVALID_ENCODING :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "invalid byte value in character "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_OVERLONG_CHARACTER :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "overlong character detected "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_RESTRICTED_CHARACTER :
+                if (assay) {
+                    assay->debug(8, "Invalid UTF-8 encoding: "
+                        "use of restricted character "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+            case UNICODE_ERROR_DECODING_ERROR :
+                if (assay) {
+                    assay->debug(8, "Error validating UTF-8 decoding "
+                        "at " + str + ". [offset \"" +
+                        std::to_string(i) + "\"]");
+                }
+                return true;
+                break;
+        }
+        /* Catch-all for rc == 0 or an error code not handled by the switch */
+        if (rc <= 0) {
+            if (assay) {
+                assay->debug(8, "Internal error during UTF-8 validation "
+                    "at " + str + ". [offset \"" +
+                    std::to_string(i) + "\"]");
+            }
+            return true;
+        }
+
+        i += rc;
+        bytes_left -= rc;
+    }
+    /* Every sequence in str decoded without error */
+    return false;
 }

+
 }  // namespace operators
 }  // namespace ModSecurity
--- a/src/operators/validate_utf8_encoding.h
+++ b/src/operators/validate_utf8_encoding.h
@@ -20,7 +20,13 @@

 #include "operators/operator.h"

-#ifdef __cplusplus
+
+#define UNICODE_ERROR_CHARACTERS_MISSING    -1  /* sequence is truncated */
+#define UNICODE_ERROR_INVALID_ENCODING      -2  /* bad lead or trail byte */
+#define UNICODE_ERROR_OVERLONG_CHARACTER    -3  /* overlong form */
+#define UNICODE_ERROR_RESTRICTED_CHARACTER  -4  /* code point not allowed */
+#define UNICODE_ERROR_DECODING_ERROR        -5  /* nothing to decode */
+

 namespace ModSecurity {
 namespace operators {
@@ -28,14 +34,18 @@ namespace operators {
 class ValidateUtf8Encoding : public Operator {
 public:
    /** @ingroup ModSecurity_Operator */
-    ValidateUtf8Encoding(std::string o, std::string p, bool i);
-    bool evaluate(Assay *assay);
+    ValidateUtf8Encoding(std::string op, std::string param, bool negation)
+        : Operator(op, param, negation) { }  /* forwarded to the base class */
+    /** Returns true when input contains an invalid UTF-8 sequence. */
+    bool evaluate(Assay *assay, const std::string &input) override;
+    /** Returns the sequence length at p_read, or a UNICODE_ERROR_* code. */
+    int detect_utf8_character(const unsigned char *p_read,
+        unsigned int length);
 };

 }  // namespace operators
 }  // namespace ModSecurity

-#endif


 #endif  // SRC_OPERATORS_VALIDATE_UTF8_ENCODING_H_
--- a/test/unit/unit_test.cc
+++ b/test/unit/unit_test.cc
@@ -94,6 +94,12 @@ UnitTest *UnitTest::from_yajl_node(yajl_val &node) {
        } else if (strcmp(key, "input") == 0) {
           u->input = YAJL_GET_STRING(val);
           replaceAll(&(u->input), "\\0", '\0');
+           replaceAll(&(u->input), "\\xe4", '\xe4');
+           replaceAll(&(u->input), "\\x03", '\x03');
+           replaceAll(&(u->input), "\\xbf", '\xbf');
+           replaceAll(&(u->input), "\\xc9", '\xc9');
+           replaceAll(&(u->input), "\\x3b", '\x3b');
+           replaceAll(&(u->input), "\\xFF", '\xff');
        } else if (strcmp(key, "name") == 0) {
           u->name = YAJL_GET_STRING(val);
        } else if (strcmp(key, "type") == 0) {