From 2b056485d088e0badc3cde91a952fae12c07bfd5 Mon Sep 17 00:00:00 2001 From: Felipe Zimmerle Date: Wed, 25 May 2016 18:19:36 -0300 Subject: [PATCH] Adds support to Utf8ToUnicode transformation Issue #974 --- src/actions/transformations/transformation.cc | 2 +- .../transformations/utf8_to_unicode.cc | 275 +++++++++++++++++- src/actions/transformations/utf8_to_unicode.h | 16 +- test/test-cases/secrules-language-tests | 2 +- 4 files changed, 275 insertions(+), 20 deletions(-) diff --git a/src/actions/transformations/transformation.cc b/src/actions/transformations/transformation.cc index 67602b91..761cbd54 100644 --- a/src/actions/transformations/transformation.cc +++ b/src/actions/transformations/transformation.cc @@ -113,7 +113,7 @@ Transformation* Transformation::instantiate(std::string a) { IF_MATCH(urlDecode) { return new UrlDecode(a); } IF_MATCH(urlDecodeUni) { return new UrlDecodeUni(a); } IF_MATCH(urlEncode) { return new UrlEncode(a); } - IF_MATCH(utf8_to_unicode) { return new Utf8Unicode(a); } + IF_MATCH(utf8ToUnicode) { return new Utf8ToUnicode(a); } return new Transformation(a); } diff --git a/src/actions/transformations/utf8_to_unicode.cc b/src/actions/transformations/utf8_to_unicode.cc index 669078bf..a98d3ad7 100644 --- a/src/actions/transformations/utf8_to_unicode.cc +++ b/src/actions/transformations/utf8_to_unicode.cc @@ -24,31 +24,278 @@ #include "modsecurity/transaction.h" #include "actions/transformations/transformation.h" +#include "src/utils.h" namespace modsecurity { namespace actions { namespace transformations { -Utf8Unicode::Utf8Unicode(std::string action) - : Transformation(action) { - this->action_kind = 1; + +std::string Utf8ToUnicode::evaluate(std::string value, + Transaction *transaction) { + std::string ret; + unsigned char *input = NULL; + int changed = 0; + + input = reinterpret_cast + (malloc(sizeof(char) * value.length()+1)); + + if (input == NULL) { + return ""; + } + + inplace(input, value.size() + 1, &changed); + + memcpy(input, value.c_str(), value.length()+1); + + ret.assign(reinterpret_cast(input), 10); + free(input); + + return ret; } -std::string Utf8Unicode::evaluate(std::string value, - Transaction *transaction) { - /** - * @todo Implement the transformation Utf8Unicode - */ - if (transaction) { -#ifndef NO_LOGS - transaction->debug(4, "Transformation Utf8Unicode is " \ - "not implemented yet."); -#endif + +char *Utf8ToUnicode::inplace(unsigned char *input, + uint64_t input_len, int *changed) { + int unicode_len = 0, length = 0; + unsigned int d = 0, count = 0; + unsigned char c, *utf; + char *rval, *data; + unsigned int i, len, j; + unsigned int bytes_left = input_len; + unsigned char unicode[8]; + *changed = 0; + + len = input_len * 7 + 1; + data = reinterpret_cast(malloc(sizeof(char) * len)); + if (data == NULL) { + return NULL; } - return value; + + if (input == NULL) { + return NULL; + } + + for (i = 0; i < bytes_left;) { + unicode_len = 0; d = 0; + utf = (unsigned char *)&input[i]; + + c = *utf; + + /* If first byte begins with binary 0 it is single byte encoding */ + if ((c & 0x80) == 0) { + /* single byte unicode (7 bit ASCII equivilent) has no validation */ + count++; + if (count <= len) { + if (c == 0) + *data = x2c(&c); + else + *data++ = c; + } + } else if ((c & 0xE0) == 0xC0) { + /* If first byte begins with binary 110 it is two byte encoding*/ + /* check we have at least two bytes */ + if (bytes_left < 2) { + /* check second byte starts with binary 10 */ + unicode_len = UNICODE_ERROR_CHARACTERS_MISSING; + } else if (((*(utf + 1)) & 0xC0) != 0x80) { + unicode_len = UNICODE_ERROR_INVALID_ENCODING; + } else { + unicode_len = 2; + count+=6; + if (count <= len) { + /* compute character number */ + d = ((c & 0x1F) << 6) | (*(utf + 1) & 0x3F); + *data++ = '%'; + *data++ = 'u'; + snprintf(reinterpret_cast(unicode), + sizeof(reinterpret_cast(unicode)), + "%x", d); + length = strlen(reinterpret_cast(unicode)); + + switch (length) { + case 1: + *data++ = '0'; + *data++ = '0'; + *data++ = '0'; + break; + case 2: + *data++ = '0'; + *data++ = '0'; + break; + case 3: + *data++ = '0'; + break; + case 4: + case 5: + break; + } + + for (j = 0; j < length; j++) { + *data++ = unicode[j]; + } + + *changed = 1; + } + } + } else if ((c & 0xF0) == 0xE0) { + /* If first byte begins with binary 1110 it is three byte encoding */ + /* check we have at least three bytes */ + if (bytes_left < 3) { + /* check second byte starts with binary 10 */ + unicode_len = UNICODE_ERROR_CHARACTERS_MISSING; + } else if (((*(utf + 1)) & 0xC0) != 0x80) { + /* check third byte starts with binary 10 */ + unicode_len = UNICODE_ERROR_INVALID_ENCODING; + } else if (((*(utf + 2)) & 0xC0) != 0x80) { + unicode_len = UNICODE_ERROR_INVALID_ENCODING; + } else { + unicode_len = 3; + count+=6; + if (count <= len) { + /* compute character number */ + d = ((c & 0x0F) << 12) + | ((*(utf + 1) & 0x3F) << 6) + | (*(utf + 2) & 0x3F); + *data++ = '%'; + *data++ = 'u'; + snprintf(reinterpret_cast(unicode), + sizeof(reinterpret_cast(unicode)), + "%x", d); + length = strlen(reinterpret_cast(unicode)); + + switch (length) { + case 1: + *data++ = '0'; + *data++ = '0'; + *data++ = '0'; + break; + case 2: + *data++ = '0'; + *data++ = '0'; + break; + case 3: + *data++ = '0'; + break; + case 4: + case 5: + break; + } + + for (j = 0; j < length; j++) { + *data++ = unicode[j]; + } + + *changed = 1; + } + } + } else if ((c & 0xF8) == 0xF0) { + /* If first byte begins with binary 11110 it + * is four byte encoding + */ + /* restrict characters to UTF-8 range (U+0000 - U+10FFFF) */ + if (c >= 0xF5) { + *data++ = c; + } + /* check we have at least four bytes */ + if (bytes_left < 4) { + /* check second byte starts with binary 10 */ + unicode_len = UNICODE_ERROR_CHARACTERS_MISSING; + } else if (((*(utf + 1)) & 0xC0) != 0x80) { + /* check third byte starts with binary 10 */ + unicode_len = UNICODE_ERROR_INVALID_ENCODING; + } else if (((*(utf + 2)) & 0xC0) != 0x80) { + /* check forth byte starts with binary 10 */ + unicode_len = UNICODE_ERROR_INVALID_ENCODING; + } else if (((*(utf + 3)) & 0xC0) != 0x80) { + unicode_len = UNICODE_ERROR_INVALID_ENCODING; + } else { + unicode_len = 4; + count+=7; + if (count <= len) { + /* compute character number */ + d = ((c & 0x07) << 18) + | ((*(utf + 1) & 0x3F) << 12) + | ((*(utf + 2) & 0x3F) << 6) + | (*(utf + 3) & 0x3F); + *data++ = '%'; + *data++ = 'u'; + snprintf(reinterpret_cast(unicode), + sizeof(reinterpret_cast(unicode)), + "%x", d); + length = strlen(reinterpret_cast(unicode)); + + switch (length) { + case 1: + *data++ = '0'; + *data++ = '0'; + *data++ = '0'; + break; + case 2: + *data++ = '0'; + *data++ = '0'; + break; + case 3: + *data++ = '0'; + break; + case 4: + case 5: + break; + } + + for (j = 0; j < length; j++) { + *data++ = unicode[j]; + } + + *changed = 1; + } + } + } else { + /* any other first byte is invalid (RFC 3629) */ + count++; + if (count <= len) + *data++ = c; + } + + /* invalid UTF-8 character number range (RFC 3629) */ + if ((d >= 0xD800) && (d <= 0xDFFF)) { + count++; + if (count <= len) + *data++ = c; + } + + /* check for overlong */ + if ((unicode_len == 4) && (d < 0x010000)) { + /* four byte could be represented with less bytes */ + count++; + if (count <= len) + *data++ = c; + } else if ((unicode_len == 3) && (d < 0x0800)) { + /* three byte could be represented with less bytes */ + count++; + if (count <= len) + *data++ = c; + } else if ((unicode_len == 2) && (d < 0x80)) { + /* two byte could be represented with less bytes */ + count++; + if (count <= len) + *data++ = c; + } + + if (unicode_len > 0) { + i += unicode_len; + } else { + i++; + } + } + + *data ='\0'; + + return NULL; } + } // namespace transformations } // namespace actions } // namespace modsecurity diff --git a/src/actions/transformations/utf8_to_unicode.h b/src/actions/transformations/utf8_to_unicode.h index c49bcb5b..564eb0c9 100644 --- a/src/actions/transformations/utf8_to_unicode.h +++ b/src/actions/transformations/utf8_to_unicode.h @@ -21,24 +21,32 @@ #ifndef SRC_ACTIONS_TRANSFORMATIONS_UTF8_TO_UNICODE_H_ #define SRC_ACTIONS_TRANSFORMATIONS_UTF8_TO_UNICODE_H_ -#ifdef __cplusplus +#define UNICODE_ERROR_CHARACTERS_MISSING -1 +#define UNICODE_ERROR_INVALID_ENCODING -2 +#define UNICODE_ERROR_OVERLONG_CHARACTER -3 +#define UNICODE_ERROR_RESTRICTED_CHARACTER -4 +#define UNICODE_ERROR_DECODING_ERROR -5 + namespace modsecurity { class Transaction; namespace actions { namespace transformations { -class Utf8Unicode : public Transformation { +class Utf8ToUnicode : public Transformation { public: - explicit Utf8Unicode(std::string action); + explicit Utf8ToUnicode(std::string action) : Transformation(action) { } + std::string evaluate(std::string exp, Transaction *transaction) override; + + static char *inplace(unsigned char *input, uint64_t input_len, + int *changed); }; } // namespace transformations } // namespace actions } // namespace modsecurity -#endif #endif // SRC_ACTIONS_TRANSFORMATIONS_UTF8_TO_UNICODE_H_ diff --git a/test/test-cases/secrules-language-tests b/test/test-cases/secrules-language-tests index 8089e2ef..ca597a23 160000 --- a/test/test-cases/secrules-language-tests +++ b/test/test-cases/secrules-language-tests @@ -1 +1 @@ -Subproject commit 8089e2ef12e8572c7d9feb8463e608b20de928ba +Subproject commit ca597a23aa6eca03757c612543802fcdb884f135