Added utf8toUnicode tfn

2026-01-16 08:27:10 +03:00 · 2012-08-24 17:02:13 +00:00
parent 499819e232
commit 781a506bc9
5 changed files with 255 additions and 6 deletions
--- a/apache2/msc_util.c
+++ b/apache2/msc_util.c
@@ -74,6 +74,233 @@ static unsigned char *c2x(unsigned what, unsigned char *where);
 static unsigned char x2c(unsigned char *what);
 static unsigned char xsingle2c(unsigned char *what);

+/** \brief Append a "%u" escape for one code point
+ *
+ * Writes "%u" followed by the lowercase hexadecimal value of \a code,
+ * zero-padded to at least four digits. Values above 0xFFFF are written
+ * with as many digits as they need and no padding.
+ *
+ * \param out Write position; needs room for 10 bytes in general, 8 bytes
+ *            when \a code fits in 21 bits
+ * \param code Code point value to encode
+ *
+ * \retval out Write position advanced past the bytes written
+ */
+static char *utf8_unicode_append_escape(char *out, unsigned int code) {
+    static const char hex_digits[] = "0123456789abcdef";
+    int shift = 12;
+
+    /* Start at four digits and widen only when the value requires it.
+     * The shift is capped at 28 so that "shift + 4" never reaches the
+     * width of a 32-bit unsigned int. */
+    while ((shift < 28) && ((code >> (shift + 4)) != 0)) {
+        shift += 4;
+    }
+
+    *out++ = '%';
+    *out++ = 'u';
+    for (; shift >= 0; shift -= 4) {
+        *out++ = hex_digits[(code >> shift) & 0x0F];
+    }
+
+    return out;
+}
+
+/** \brief Convert UTF-8 multi-byte sequences to "%uHHHH" escapes
+ *
+ * Despite the name the input is not modified: the result is written to a
+ * new buffer allocated from \a mp. Output rules, per input position:
+ *  - 7-bit bytes are copied unchanged.
+ *  - A well-formed 2, 3 or 4 byte sequence is replaced by "%u" plus the
+ *    decoded value in hexadecimal (at least four digits).
+ *  - If the decoded value is a surrogate (0xD800-0xDFFF) or the sequence
+ *    is overlong, the lead byte is additionally copied after the escape.
+ *  - A lead byte whose continuation bytes are missing or malformed
+ *    produces no output and only that single byte is skipped.
+ *  - Any other byte (stray continuation byte, 0xF8 and above) is copied
+ *    unchanged.
+ *
+ * \param mp Pointer to memory pool
+ * \param input Pointer to input data
+ * \param input_len Input data length
+ * \param changed Set to 1 if at least one sequence was escaped, else 0
+ *
+ * \retval rval NUL-terminated converted string on success
+ * \retval NULL If input is NULL, input_len is negative or too large,
+ *              or the allocation fails
+ */
+char *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed) {
+    char *rval, *data;
+    size_t total, i;
+
+    *changed = 0;
+
+    if ((input == NULL) || (input_len < 0)) return NULL;
+
+    total = (size_t)input_len;
+
+    /* Refuse lengths for which the buffer size below would wrap around. */
+    if (total > (((size_t)-1) - 1) / 7) return NULL;
+
+    /* Seven output bytes per input byte is a safe upper bound: the worst
+     * cases are a 2 or 3 byte sequence producing 7 bytes (6 byte escape
+     * plus one raw lead byte) and a 4 byte sequence producing 9 bytes
+     * (raw lead byte plus 8 byte escape). An escape is only written after
+     * the whole sequence is known to be present, so the space reserved
+     * for its continuation bytes is always available. */
+    data = rval = apr_palloc(mp, total * 7 + 1);
+    if (rval == NULL) return NULL;
+
+    for (i = 0; i < total;) {
+        const unsigned char *utf = &input[i];
+        const unsigned char c = *utf;
+        /* Bytes available from the current position, lead byte included.
+         * Must be recomputed every iteration so that a sequence truncated
+         * at the end of the input is detected instead of read past. */
+        const size_t bytes_left = total - i;
+        unsigned int d = 0;
+        int unicode_len = 0;
+
+        /* If first byte begins with binary 0 it is single byte encoding */
+        if ((c & 0x80) == 0) {
+            /* single byte unicode (7 bit ASCII equivalent) has no validation */
+            *data++ = c;
+        }
+        /* If first byte begins with binary 110 it is two byte encoding */
+        else if ((c & 0xE0) == 0xC0) {
+            /* check we have at least two bytes */
+            if (bytes_left < 2) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            /* check second byte starts with binary 10 */
+            else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            else {
+                unicode_len = 2;
+                /* compute character number */
+                d = ((c & 0x1F) << 6) | (utf[1] & 0x3F);
+                data = utf8_unicode_append_escape(data, d);
+                *changed = 1;
+            }
+        }
+        /* If first byte begins with binary 1110 it is three byte encoding */
+        else if ((c & 0xF0) == 0xE0) {
+            /* check we have at least three bytes */
+            if (bytes_left < 3) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            /* check second byte starts with binary 10 */
+            else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            /* check third byte starts with binary 10 */
+            else if ((utf[2] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            else {
+                unicode_len = 3;
+                /* compute character number */
+                d = ((c & 0x0F) << 12) | ((utf[1] & 0x3F) << 6) | (utf[2] & 0x3F);
+                data = utf8_unicode_append_escape(data, d);
+                *changed = 1;
+            }
+        }
+        /* If first byte begins with binary 11110 it is four byte encoding */
+        else if ((c & 0xF8) == 0xF0) {
+            /* Lead bytes 0xF5-0xF7 can only encode values above U+10FFFF.
+             * The byte is copied raw and the sequence is still processed
+             * below; this keeps the function's existing output. */
+            if (c >= 0xF5) {
+                *data++ = c;
+            }
+            /* check we have at least four bytes */
+            if (bytes_left < 4) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            /* check second byte starts with binary 10 */
+            else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            /* check third byte starts with binary 10 */
+            else if ((utf[2] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            /* check fourth byte starts with binary 10 */
+            else if ((utf[3] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            else {
+                unicode_len = 4;
+                /* compute character number; every term is a shift, the
+                 * third byte contributes bits 6-11 */
+                d = ((c & 0x07) << 18) | ((utf[1] & 0x3F) << 12) | ((utf[2] & 0x3F) << 6) | (utf[3] & 0x3F);
+                data = utf8_unicode_append_escape(data, d);
+                *changed = 1;
+            }
+        }
+        /* any other first byte is invalid (RFC 3629) */
+        else {
+            *data++ = c;
+        }
+
+        /* invalid UTF-8 character number range (RFC 3629): keep the lead
+         * byte after the escape. d stays 0 unless a sequence was decoded. */
+        if ((d >= 0xD800) && (d <= 0xDFFF)) {
+            *data++ = c;
+        }
+
+        /* check for overlong: the value could have been represented with
+         * fewer bytes, so keep the lead byte after the escape */
+        if (((unicode_len == 4) && (d < 0x010000)) ||
+            ((unicode_len == 3) && (d < 0x0800)) ||
+            ((unicode_len == 2) && (d < 0x80))) {
+            *data++ = c;
+        }
+
+        /* Consume the whole sequence when one was decoded; otherwise
+         * (single byte, or an error code left in unicode_len) advance by
+         * exactly one byte. */
+        if (unicode_len > 0) {
+            i += (size_t)unicode_len;
+        } else {
+            i++;
+        }
+    }
+
+    *data = '\0';
+
+    return rval;
+}
+
 /** \brief Validate IPv4 Netmask
 *
 * \param ip_strv6 Pointer to ipv6 address