ModSecurity/src/operators/validate_utf8_encoding.cc

/*
 * ModSecurity, http://www.modsecurity.org/
 * Copyright (c) 2015 - 2021 Trustwave Holdings, Inc. (http://www.trustwave.com/)
 *
 * You may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * If any of the files related to licensing are missing or if you have any
 * other questions related to licensing please contact Trustwave Holdings, Inc.
 * directly using the email address security@modsecurity.org.
 *
 */

#include "src/operators/validate_utf8_encoding.h"

#include <string>

#include "src/operators/operator.h"


/*
 * Negative status codes that detect_utf8_character() returns in place of a
 * byte count when the inspected character is not acceptable.
 */
/* the buffer ends before the sequence announced by the lead byte is complete */
constexpr int UNICODE_ERROR_CHARACTERS_MISSING{-1};
/* lead byte is not a valid UTF-8 start, or a trailing byte is not 10xxxxxx */
constexpr int UNICODE_ERROR_INVALID_ENCODING{-2};
/* the code point was written with more bytes than it needs */
constexpr int UNICODE_ERROR_OVERLONG_CHARACTER{-3};
/* code point that UTF-8 must not carry (e.g. surrogates U+D800..U+DFFF) */
constexpr int UNICODE_ERROR_RESTRICTED_CHARACTER{-4};
/* the detector was handed a null pointer instead of a buffer */
constexpr int UNICODE_ERROR_DECODING_ERROR{-5};


namespace modsecurity {
namespace operators {

/*
 * Validate the single UTF-8 encoded character that starts at p_read.
 *
 * p_read: first (lead) byte of the character to inspect.
 * length: number of readable bytes starting at p_read.
 *
 * Returns the size of the character in bytes (1 to 4) when it is well formed.
 * Otherwise returns one of the negative UNICODE_ERROR_* codes:
 *   DECODING_ERROR        p_read is null
 *   CHARACTERS_MISSING    length is too short for the announced sequence
 *   INVALID_ENCODING      invalid lead byte or malformed continuation byte
 *   OVERLONG_CHARACTER    the code point is not in its shortest form
 *   RESTRICTED_CHARACTER  surrogate (U+D800..U+DFFF) or above U+10FFFF
 */
int ValidateUtf8Encoding::detect_utf8_character(
    const unsigned char *p_read, unsigned int length) {
    int unicode_len = 0;
    unsigned int d = 0;

    if (p_read == nullptr) {
        return UNICODE_ERROR_DECODING_ERROR;
    }
    /* an empty buffer holds no lead byte: never read past its end */
    if (length == 0) {
        return UNICODE_ERROR_CHARACTERS_MISSING;
    }
    const unsigned char c = *p_read;

    /* If first byte begins with binary 0 it is single byte encoding */
    if ((c & 0x80) == 0) {
        /* single byte unicode (7 bit ASCII equivalent) has no validation */
        return 1;
    } else if ((c & 0xE0) == 0xC0) {
        /* If first byte begins with binary 110 it is two byte encoding */
        /* check we have at least two bytes */
        if (length < 2) {
            return UNICODE_ERROR_CHARACTERS_MISSING;
        }
        /* check second byte starts with binary 10 */
        if ((p_read[1] & 0xC0) != 0x80) {
            return UNICODE_ERROR_INVALID_ENCODING;
        }
        unicode_len = 2;
        /* compute character number */
        d = ((c & 0x1F) << 6) | (p_read[1] & 0x3F);
    } else if ((c & 0xF0) == 0xE0) {
        /* If first byte begins with binary 1110 it is three byte encoding */
        /* check we have at least three bytes */
        if (length < 3) {
            return UNICODE_ERROR_CHARACTERS_MISSING;
        }
        /* check second and third bytes start with binary 10 */
        if (((p_read[1] & 0xC0) != 0x80) || ((p_read[2] & 0xC0) != 0x80)) {
            return UNICODE_ERROR_INVALID_ENCODING;
        }
        unicode_len = 3;
        /* compute character number */
        d = ((c & 0x0F) << 12) | ((p_read[1] & 0x3F) << 6)
            | (p_read[2] & 0x3F);
    } else if ((c & 0xF8) == 0xF0) {
        /* If first byte begins with binary 11110 it is four byte encoding */
        /* lead bytes 0xF5..0xF7 can only encode values above U+10FFFF */
        if (c >= 0xF5) {
            return UNICODE_ERROR_RESTRICTED_CHARACTER;
        }
        /* check we have at least four bytes */
        if (length < 4) {
            return UNICODE_ERROR_CHARACTERS_MISSING;
        }
        /* check the three trailing bytes start with binary 10 */
        if (((p_read[1] & 0xC0) != 0x80) || ((p_read[2] & 0xC0) != 0x80)
            || ((p_read[3] & 0xC0) != 0x80)) {
            return UNICODE_ERROR_INVALID_ENCODING;
        }
        unicode_len = 4;
        /* compute character number */
        d = ((c & 0x07) << 18) | ((p_read[1] & 0x3F) << 12)
            | ((p_read[2] & 0x3F) << 6) | (p_read[3] & 0x3F);
    } else {
        /* any other first byte is invalid (RFC 3629) */
        return UNICODE_ERROR_INVALID_ENCODING;
    }

    /* surrogate halves are not valid UTF-8 character numbers (RFC 3629) */
    if ((d >= 0xD800) && (d <= 0xDFFF)) {
        return UNICODE_ERROR_RESTRICTED_CHARACTER;
    }

    /*
     * Lead byte 0xF4 followed by 0x90..0xBF passes the lead byte test above
     * but still decodes beyond U+10FFFF, the last code point UTF-8 may carry.
     */
    if (d > 0x10FFFF) {
        return UNICODE_ERROR_RESTRICTED_CHARACTER;
    }

    /* check for overlong */
    if ((unicode_len == 4) && (d < 0x010000)) {
        /* four byte could be represented with less bytes */
        return UNICODE_ERROR_OVERLONG_CHARACTER;
    } else if ((unicode_len == 3) && (d < 0x0800)) {
        /* three byte could be represented with less bytes */
        return UNICODE_ERROR_OVERLONG_CHARACTER;
    } else if ((unicode_len == 2) && (d < 0x80)) {
        /* two byte could be represented with less bytes */
        return UNICODE_ERROR_OVERLONG_CHARACTER;
    }

    return unicode_len;
}

/*
 * Walk str one UTF-8 character at a time using detect_utf8_character().
 *
 * Returns true as soon as a character fails validation and false when every
 * character in str (including the empty string) is accepted. On failure,
 * when a transaction is available, a level 8 debug line describing the
 * problem is emitted and the failing offset is passed to logOffset().
 */
bool ValidateUtf8Encoding::evaluate(Transaction *transaction, RuleWithActions *rule,
    const std::string &str, RuleMessage &ruleMessage) {
    /* a UTF-8 character is never longer than four bytes */
    constexpr unsigned int max_char_bytes = 4;

    const unsigned char *bytes =
        reinterpret_cast<const unsigned char *>(str.c_str());
    const std::string::size_type total = str.size();

    /* size_type counters: an unsigned int would wrap on inputs > UINT_MAX */
    for (std::string::size_type i = 0; i < total;) {
        /*
         * The detector only compares the length against 2, 3 and 4, so
         * capping it keeps the value inside unsigned int for any input size.
         */
        const std::string::size_type remaining = total - i;
        const unsigned int available = (remaining < max_char_bytes)
            ? static_cast<unsigned int>(remaining) : max_char_bytes;

        const int rc = detect_utf8_character(bytes + i, available);

        /* translate the status into the text that opens the debug line */
        const char *reason = nullptr;
        switch (rc) {
            case UNICODE_ERROR_CHARACTERS_MISSING :
                reason = "Invalid UTF-8 encoding: "
                    "not enough bytes in character ";
                break;
            case UNICODE_ERROR_INVALID_ENCODING :
                reason = "Invalid UTF-8 encoding: "
                    "invalid byte value in character ";
                break;
            case UNICODE_ERROR_OVERLONG_CHARACTER :
                reason = "Invalid UTF-8 encoding: "
                    "overlong character detected ";
                break;
            case UNICODE_ERROR_RESTRICTED_CHARACTER :
                reason = "Invalid UTF-8 encoding: "
                    "use of restricted character ";
                break;
            case UNICODE_ERROR_DECODING_ERROR :
                reason = "Error validating UTF-8 decoding ";
                break;
            default :
                /* any other non-positive value would stall the loop */
                if (rc <= 0) {
                    reason = "Internal error during UTF-8 validation ";
                }
                break;
        }

        if (reason != nullptr) {
            if (transaction) {
                ms_dbg_a(transaction, 8, std::string(reason) +
                    "at " + str + ". [offset \"" +
                    std::to_string(i) + "\"]");
                /*
                 * Every failure records its offset, including a character
                 * that is cut short by the end of the input.
                 */
                logOffset(ruleMessage, i, str.size());
            }
            return true;
        }

        i += static_cast<std::string::size_type>(rc);
    }

    return false;
}


}  // namespace operators
}  // namespace modsecurity