From 2b056485d088e0badc3cde91a952fae12c07bfd5 Mon Sep 17 00:00:00 2001
From: Felipe Zimmerle <felipe@zimmerle.org>
Date: Wed, 25 May 2016 18:19:36 -0300
Subject: [PATCH] Adds support for the Utf8ToUnicode transformation

Issue #974
---
 src/actions/transformations/transformation.cc |   2 +-
 .../transformations/utf8_to_unicode.cc        | 275 +++++++++++++++++-
 src/actions/transformations/utf8_to_unicode.h |  16 +-
 test/test-cases/secrules-language-tests       |   2 +-
 4 files changed, 275 insertions(+), 20 deletions(-)
diff --git a/src/actions/transformations/transformation.cc b/src/actions/transformations/transformation.cc
index 67602b91..761cbd54 100644
--- a/src/actions/transformations/transformation.cc
+++ b/src/actions/transformations/transformation.cc
@@ -113,7 +113,7 @@ Transformation* Transformation::instantiate(std::string a) {
     IF_MATCH(urlDecode) { return new UrlDecode(a); }
     IF_MATCH(urlDecodeUni) { return new UrlDecodeUni(a); }
     IF_MATCH(urlEncode) { return new UrlEncode(a); }
-    IF_MATCH(utf8_to_unicode) { return new Utf8Unicode(a); }
+    IF_MATCH(utf8ToUnicode) { return new Utf8ToUnicode(a); }
 
     return new Transformation(a);
 }
diff --git a/src/actions/transformations/utf8_to_unicode.cc b/src/actions/transformations/utf8_to_unicode.cc
index 669078bf..a98d3ad7 100644
--- a/src/actions/transformations/utf8_to_unicode.cc
+++ b/src/actions/transformations/utf8_to_unicode.cc
@@ -24,31 +24,278 @@
 
 #include "modsecurity/transaction.h"
 #include "actions/transformations/transformation.h"
+#include "src/utils.h"
 
 
 namespace modsecurity {
 namespace actions {
 namespace transformations {
 
-Utf8Unicode::Utf8Unicode(std::string action)
-    : Transformation(action) {
-    this->action_kind = 1;
+
+std::string Utf8ToUnicode::evaluate(std::string value,
+    Transaction *transaction) {
+    /* Returns value with every multi-byte UTF-8 sequence that inplace()
+     * accepts rewritten as "%uHHHH".  transaction is not used.  When
+     * inplace() hands back no buffer, value is returned unchanged. */
+    int changed = 0;
+
+    /* value is this call's private copy, so inplace() can read straight
+     * from its storage; no malloc()ed scratch copy is needed. */
+    char *out = inplace(reinterpret_cast<unsigned char *>(&value[0]),
+        value.size(), &changed);
+    if (out == NULL) {
+        return value;
+    }
+
+    /* A non-NULL result is a NUL-terminated malloc() buffer that is now
+     * ours to release. */
+    std::string ret(out);
+    free(out);
+
+    return ret;
 }
 
-std::string Utf8Unicode::evaluate(std::string value,
-    Transaction *transaction) {
-    /**
-     * @todo Implement the transformation Utf8Unicode
-     */
-    if (transaction) {
-#ifndef NO_LOGS
-        transaction->debug(4, "Transformation Utf8Unicode is " \
-            "not implemented yet.");
-#endif
+
+char *Utf8ToUnicode::inplace(unsigned char *input,
+    uint64_t input_len, int *changed) {
+    /**
+     * Builds a new string from the input_len bytes at input in which each
+     * structurally valid 2-, 3- or 4-byte UTF-8 sequence is replaced by
+     * "%u" followed by its code point in lowercase hex, zero-padded to at
+     * least four digits.  Non-NUL ASCII bytes and bytes that cannot start
+     * a sequence are copied unchanged.  Despite the name, input itself is
+     * never written to.
+     *
+     * @param input      bytes to convert; NULL makes the call fail.
+     * @param input_len  number of bytes readable at input.
+     * @param changed    set to 1 if at least one sequence was converted,
+     *                   otherwise to 0; must not be NULL.
+     * @return malloc()ed, NUL-terminated result that the caller must
+     *         free(), or NULL if input is NULL or the allocation fails.
+     */
+    int unicode_len = 0;
+    unsigned int d = 0, length = 0, j;
+    unsigned char c, *utf;
+    char *rval, *data;
+    uint64_t i, len, count = 0, bytes_left;
+    unsigned char unicode[8];
+    *changed = 0;
+
+    if (input == NULL) {
+        return NULL;
+    }
+    /* sized generously: 7 output bytes are budgeted per input byte */
+    len = input_len * 7 + 1;
+    rval = data = reinterpret_cast<char *>(malloc(sizeof(char) * len));
+    if (data == NULL) {
+        return NULL;
     }
-    return value;
+
+    for (i = 0; i < input_len;) {
+        unicode_len = 0; d = 0;
+        utf = &input[i];
+        bytes_left = input_len - i;  /* bytes from utf to end of input */
+        c = *utf;
+
+        /* If first byte begins with binary 0 it is single byte encoding */
+        if ((c & 0x80) == 0) {
+            /* single byte unicode (7 bit ASCII equivalent) has no validation */
+            count++;
+            if (count <= len) {
+                if (c == 0)
+                    *data = x2c(&c);
+                else
+                    *data++ = c;
+            }
+        } else if ((c & 0xE0) == 0xC0) {
+            /* first byte begins with binary 110: two byte encoding */
+            /* check we have at least two bytes */
+            if (bytes_left < 2) {
+                /* the sequence is cut off by the end of the input */
+                unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            } else if (((*(utf + 1)) & 0xC0) != 0x80) {
+                /* a continuation byte must start with binary 10 */
+                unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            } else {
+                unicode_len = 2;
+                count+=6;
+                if (count <= len) {
+                    /* compute character number */
+                    d = ((c & 0x1F) << 6) | (*(utf + 1) & 0x3F);
+                    *data++ = '%';
+                    *data++ = 'u';
+                    snprintf(reinterpret_cast<char *>(unicode),
+                             sizeof(unicode), "%x", d);
+                    length = strlen(reinterpret_cast<char *>(unicode));
+
+                    switch (length) {
+                        case 1:
+                            *data++ = '0';
+                            *data++ = '0';
+                            *data++ = '0';
+                            break;
+                        case 2:
+                            *data++ = '0';
+                            *data++ = '0';
+                            break;
+                        case 3:
+                            *data++ = '0';
+                            break;
+                    }
+
+                    for (j = 0; j < length; j++) {
+                        *data++ = unicode[j];
+                    }
+
+                    *changed = 1;
+                }
+            }
+        } else if ((c & 0xF0) == 0xE0) {
+            /* first byte begins with binary 1110: three byte encoding */
+            /* check we have at least three bytes */
+            if (bytes_left < 3) {
+                /* the sequence is cut off by the end of the input */
+                unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            } else if (((*(utf + 1)) & 0xC0) != 0x80) {
+                /* every continuation byte must start with binary 10 */
+                unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            } else if (((*(utf + 2)) & 0xC0) != 0x80) {
+                unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            } else {
+                unicode_len = 3;
+                count+=6;
+                if (count <= len) {
+                    /* compute character number */
+                    d = ((c & 0x0F) << 12)
+                        | ((*(utf + 1) & 0x3F) << 6)
+                        | (*(utf + 2) & 0x3F);
+                    *data++ = '%';
+                    *data++ = 'u';
+                    snprintf(reinterpret_cast<char *>(unicode),
+                             sizeof(unicode), "%x", d);
+                    length = strlen(reinterpret_cast<char *>(unicode));
+
+                    switch (length) {
+                        case 1:
+                            *data++ = '0';
+                            *data++ = '0';
+                            *data++ = '0';
+                            break;
+                        case 2:
+                            *data++ = '0';
+                            *data++ = '0';
+                            break;
+                        case 3:
+                            *data++ = '0';
+                            break;
+                    }
+
+                    for (j = 0; j < length; j++) {
+                        *data++ = unicode[j];
+                    }
+
+                    *changed = 1;
+                }
+            }
+        } else if ((c & 0xF8) == 0xF0) {
+            /* first byte begins with binary 11110: four byte encoding */
+            /* lead bytes above 0xF4 exceed U+10FFFF; echo the byte, go on */
+            if (c >= 0xF5) {
+                *data++ = c;
+            }
+            /* check we have at least four bytes */
+            if (bytes_left < 4) {
+                /* the sequence is cut off by the end of the input */
+                unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            } else if (((*(utf + 1)) & 0xC0) != 0x80) {
+                /* every continuation byte must start with binary 10 */
+                unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            } else if (((*(utf + 2)) & 0xC0) != 0x80) {
+                unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            } else if (((*(utf + 3)) & 0xC0) != 0x80) {
+                unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            } else {
+                unicode_len = 4;
+                count+=7;
+                if (count <= len) {
+                    /* compute character number */
+                    d = ((c & 0x07) << 18)
+                        | ((*(utf + 1) & 0x3F) << 12)
+                        | ((*(utf + 2) & 0x3F) << 6)
+                        | (*(utf + 3) & 0x3F);
+                    *data++ = '%';
+                    *data++ = 'u';
+                    snprintf(reinterpret_cast<char *>(unicode),
+                             sizeof(unicode), "%x", d);
+                    length = strlen(reinterpret_cast<char *>(unicode));
+
+                    switch (length) {
+                        case 1:
+                            *data++ = '0';
+                            *data++ = '0';
+                            *data++ = '0';
+                            break;
+                        case 2:
+                            *data++ = '0';
+                            *data++ = '0';
+                            break;
+                        case 3:
+                            *data++ = '0';
+                            break;
+                    }
+
+                    for (j = 0; j < length; j++) {
+                        *data++ = unicode[j];
+                    }
+
+                    *changed = 1;
+                }
+            }
+        } else {
+            /* any other first byte is invalid (RFC 3629) */
+            count++;
+            if (count <= len)
+                *data++ = c;
+        }
+
+        /* surrogate range, invalid per RFC 3629: lead byte is appended too */
+        if ((d >= 0xD800) && (d <= 0xDFFF)) {
+            count++;
+            if (count <= len)
+                *data++ = c;
+        }
+
+        /* overlong forms also get the lead byte appended after the %u text */
+        if ((unicode_len == 4) && (d < 0x010000)) {
+            /* four byte could be represented with less bytes */
+            count++;
+            if (count <= len)
+                *data++ = c;
+        } else if ((unicode_len == 3) && (d < 0x0800)) {
+            /* three byte could be represented with less bytes */
+            count++;
+            if (count <= len)
+                *data++ = c;
+        } else if ((unicode_len == 2) && (d < 0x80)) {
+            /* two byte could be represented with less bytes */
+            count++;
+            if (count <= len)
+                *data++ = c;
+        }
+
+        if (unicode_len > 0) {
+            i += unicode_len;
+        } else {
+            i++;
+        }
+    }
+
+    *data ='\0';
+
+    return rval;
 }
 
+
 }  // namespace transformations
 }  // namespace actions
 }  // namespace modsecurity
diff --git a/src/actions/transformations/utf8_to_unicode.h b/src/actions/transformations/utf8_to_unicode.h
index c49bcb5b..564eb0c9 100644
--- a/src/actions/transformations/utf8_to_unicode.h
+++ b/src/actions/transformations/utf8_to_unicode.h
@@ -21,24 +21,32 @@
 #ifndef SRC_ACTIONS_TRANSFORMATIONS_UTF8_TO_UNICODE_H_
 #define SRC_ACTIONS_TRANSFORMATIONS_UTF8_TO_UNICODE_H_
 
-#ifdef __cplusplus
+#define UNICODE_ERROR_CHARACTERS_MISSING    -1  /* too few bytes remain */
+#define UNICODE_ERROR_INVALID_ENCODING      -2  /* bad continuation byte */
+#define UNICODE_ERROR_OVERLONG_CHARACTER    -3  /* unused in this change */
+#define UNICODE_ERROR_RESTRICTED_CHARACTER  -4  /* unused in this change */
+#define UNICODE_ERROR_DECODING_ERROR        -5  /* unused in this change */
+
 namespace modsecurity {
 class Transaction;
 
 namespace actions {
 namespace transformations {
 
-class Utf8Unicode : public Transformation {
+class Utf8ToUnicode : public Transformation {
  public:
-    explicit Utf8Unicode(std::string action);
+    explicit Utf8ToUnicode(std::string action) : Transformation(action) { }
+    /* Transformation entry point; defined in utf8_to_unicode.cc. */
     std::string evaluate(std::string exp,
         Transaction *transaction) override;
+    /* Static helper called by evaluate() to process the raw bytes. */
+    static char *inplace(unsigned char *input, uint64_t input_len,
+        int *changed);
 };
 
 }  // namespace transformations
 }  // namespace actions
 }  // namespace modsecurity
 
-#endif
 
 #endif  // SRC_ACTIONS_TRANSFORMATIONS_UTF8_TO_UNICODE_H_
diff --git a/test/test-cases/secrules-language-tests b/test/test-cases/secrules-language-tests
index 8089e2ef..ca597a23 160000
--- a/test/test-cases/secrules-language-tests
+++ b/test/test-cases/secrules-language-tests
@@ -1 +1 @@
-Subproject commit 8089e2ef12e8572c7d9feb8463e608b20de928ba
+Subproject commit ca597a23aa6eca03757c612543802fcdb884f135