Perform Utf8ToUnicode transformation in-place

- Removed inplace helper function from the class, as it's only referenced by the implementation.
2025-12-31 05:39:10 +03:00 · 2024-08-19 10:15:12 -07:00
parent 17a2cbd164
commit 2c3c228725
2 changed files with 30 additions and 66 deletions
--- a/src/actions/transformations/utf8_to_unicode.cc
+++ b/src/actions/transformations/utf8_to_unicode.cc
@@ -20,67 +20,33 @@
 #include "src/utils/string.h"


+constexpr int UNICODE_ERROR_CHARACTERS_MISSING   = -1;
+constexpr int UNICODE_ERROR_INVALID_ENCODING     = -2;
+
+
 namespace modsecurity::actions::transformations {


-bool Utf8ToUnicode::transform(std::string &value, const Transaction *trans) const {
-    std::string ret;
-    unsigned char *input;
-    int _changed = 0;
-    char *out;
+static inline bool encode(std::string &value) {
+    auto input = reinterpret_cast<unsigned char*>(value.data());
+    const auto input_len = value.length();

-    input = reinterpret_cast<unsigned char *>
-        (malloc(sizeof(char) * value.length()+1));
-
-    if (input == NULL) {
-        return "";
-    }
-
-    memcpy(input, value.c_str(), value.length()+1);
-
-    out = inplace(input, value.size() + 1, &_changed);
-    free(input);
-    if (out != NULL) {
-        ret.assign(reinterpret_cast<char *>(out),
-            strlen(reinterpret_cast<char *>(out)));
-        free(out);
-    }
-
-    const auto changed = ret != value;
-    value = ret;
-    return changed;
-}
-
-
-char *Utf8ToUnicode::inplace(unsigned char *input,
-    uint64_t input_len, int *changed) {
-    unsigned int count = 0;
-    char *data;
-    char *data_orig;
-    unsigned int i, len, j;
-    unsigned int bytes_left = input_len;
+    bool changed = false;
+    std::string::size_type count = 0;
+    auto bytes_left = input_len;
    unsigned char unicode[8];
-    *changed = 0;

    /* RFC3629 states that UTF-8 are encoded using sequences of 1 to 4 octets. */
    /* Max size per character should fit in 4 bytes */
-    len = input_len * 4 + 1;
-    data = reinterpret_cast<char *>(malloc(sizeof(char) * len));
-    if (data == NULL) {
-        return NULL;
-    }
-    data_orig = data;
+    const auto len = input_len * 4 + 1;
+    std::string ret(len, {});
+    auto data = ret.data();

-    if (input == NULL) {
-        free(data);
-        return NULL;
-    }
-
-    for (i = 0; i < bytes_left;)  {
+    for (std::string::size_type i = 0; i < bytes_left;)  {
        int unicode_len = 0;
        unsigned int d = 0;
        unsigned char c;
-        unsigned char *utf = (unsigned char *)&input[i];
+        auto utf = &input[i];

        c = *utf;

@@ -108,7 +74,7 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                unicode_len = UNICODE_ERROR_INVALID_ENCODING;
            } else {
                unicode_len = 2;
-                count+=6;
+                count += 6;
                if (count <= len) {
                    int length = 0;
                    /* compute character number */
@@ -138,11 +104,11 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                            break;
                    }

-                    for (j = 0; j < length; j++) {
+                    for (std::string::size_type j = 0; j < length; j++) {
                        *data++ = unicode[j];
                    }

-                    *changed = 1;
+                    changed = true;
                }
            }
        } else if ((c & 0xF0) == 0xE0) {
@@ -190,11 +156,11 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                            break;
                    }

-                    for (j = 0; j < length; j++) {
+                    for (std::string::size_type j = 0; j < length; j++) {
                        *data++ = unicode[j];
                    }

-                    *changed = 1;
+                    changed = true;
                }
            }
        } else if ((c & 0xF8) == 0xF0) {
@@ -252,11 +218,11 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                            break;
                    }

-                    for (j = 0; j < length; j++) {
+                    for (std::string::size_type j = 0; j < length; j++) {
                        *data++ = unicode[j];
                    }

-                    *changed = 1;
+                    changed = true;
                }
            }
        } else {
@@ -300,7 +266,14 @@ char *Utf8ToUnicode::inplace(unsigned char *input,

    *data ='\0';

-    return data_orig;
+    ret.resize(data - ret.c_str());
+    std::swap(value, ret);
+    return changed;
+}
+
+
+bool Utf8ToUnicode::transform(std::string &value, const Transaction *trans) const {
+    return encode(value);
 }


--- a/src/actions/transformations/utf8_to_unicode.h
+++ b/src/actions/transformations/utf8_to_unicode.h
@@ -18,12 +18,6 @@

 #include "transformation.h"

-#define UNICODE_ERROR_CHARACTERS_MISSING    -1
-#define UNICODE_ERROR_INVALID_ENCODING      -2
-#define UNICODE_ERROR_OVERLONG_CHARACTER    -3
-#define UNICODE_ERROR_RESTRICTED_CHARACTER  -4
-#define UNICODE_ERROR_DECODING_ERROR        -5
-
 namespace modsecurity::actions::transformations {

 class Utf8ToUnicode : public Transformation {
@@ -32,9 +26,6 @@ class Utf8ToUnicode : public Transformation {
        : Transformation(action) { }

    bool transform(std::string &value, const Transaction *trans) const override;
-
-    static char *inplace(unsigned char *input, uint64_t input_len,
-        int *changed);
 };

 }  // namespace modsecurity::actions::transformations