mirror of
https://github.com/owasp-modsecurity/ModSecurity.git
synced 2025-08-16 07:56:12 +03:00
Incorrect utf8toUnicode transformation for 00xx
Fix issue and restructure handling
This commit is contained in:
parent
1121ef0bed
commit
907d61ad6d
@ -105,82 +105,44 @@ int swap_int32(int x) {
|
|||||||
*/
|
*/
|
||||||
char *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed) {
|
char *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed) {
|
||||||
int unicode_len = 0, length = 0;
|
int unicode_len = 0, length = 0;
|
||||||
unsigned int d = 0, count = 0;
|
unsigned int d = 0;
|
||||||
unsigned char c, *utf;
|
unsigned char c, *utf;
|
||||||
char *rval, *data;
|
char *rval, *data;
|
||||||
unsigned int i, len, j;
|
unsigned int i, len, j;
|
||||||
unsigned int bytes_left = input_len;
|
unsigned int bytes_left = input_len;
|
||||||
unsigned char *unicode = NULL;
|
unsigned char *unicode = NULL;
|
||||||
|
|
||||||
|
if (input == NULL) return NULL;
|
||||||
|
|
||||||
*changed = 0;
|
*changed = 0;
|
||||||
/* RFC 3629 states that UTF-8 characters are encoded using sequences of 1 to 4 octets. */
|
/* RFC 3629 states that UTF-8 characters are encoded using sequences of 1 to 4 octets. */
|
||||||
/* Max size per character should fit in 4 bytes */
|
/* Max size per character should fit in 4 bytes (%u01020304) */
|
||||||
len = input_len * 4 + 1;
|
len = input_len * 10 + 1;
|
||||||
data = rval = apr_palloc(mp, len);
|
data = rval = apr_palloc(mp, len);
|
||||||
if (rval == NULL) return NULL;
|
if (rval == NULL) return NULL;
|
||||||
|
|
||||||
|
|
||||||
if (input == NULL) return NULL;
|
|
||||||
|
|
||||||
for (i = 0; i < bytes_left;) {
|
for (i = 0; i < bytes_left;) {
|
||||||
unicode_len = 0; d = 0;
|
unicode_len = 0; d = 0;
|
||||||
utf = (unsigned char *)&input[i];
|
utf = (unsigned char *)&input[i];
|
||||||
|
|
||||||
c = *utf;
|
c = *utf;
|
||||||
|
|
||||||
/* If first byte begins with binary 0 it is single byte encoding */
|
/* If first byte begins with binary 0 it may be single byte encoding */
|
||||||
if ((c & 0x80) == 0) {
|
if ((c & 0x80) == 0) {
|
||||||
/* single byte unicode (7 bit ASCII equivalent) has no validation */
|
if (c == 0) {
|
||||||
count++;
|
unicode_len = 2;
|
||||||
if(count <= len) {
|
d = utf[1];
|
||||||
if(c == 0)
|
|
||||||
*data = x2c(&c);
|
|
||||||
else
|
|
||||||
*data++ = c;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
/* If first byte begins with binary 110 it is two byte encoding*/
|
/* If first byte begins with binary 110 it is two byte encoding*/
|
||||||
else if ((c & 0xE0) == 0xC0) {
|
else if ((c & 0xE0) == 0xC0) {
|
||||||
/* check we have at least two bytes */
|
/* check we have at least two bytes */
|
||||||
if (bytes_left < 2) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
if (bytes_left < 2) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
||||||
/* check second byte starts with binary 10 */
|
/* check second byte starts with binary 10 */
|
||||||
else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||||
else {
|
else {
|
||||||
unicode_len = 2;
|
unicode_len = 2;
|
||||||
count+=6;
|
|
||||||
if(count <= len) {
|
|
||||||
/* compute character number */
|
/* compute character number */
|
||||||
d = ((c & 0x1F) << 6) | (*(utf + 1) & 0x3F);
|
d = ((c & 0x1F) << 6) | (utf[1] & 0x3F);
|
||||||
*data++ = '%';
|
|
||||||
*data++ = 'u';
|
|
||||||
unicode = apr_psprintf(mp, "%x", d);
|
|
||||||
length = strlen(unicode);
|
|
||||||
|
|
||||||
switch(length) {
|
|
||||||
case 1:
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
case 5:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(j=0; j<length; j++) {
|
|
||||||
*data++ = unicode[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
*changed = 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* If first byte begins with binary 1110 it is three byte encoding */
|
/* If first byte begins with binary 1110 it is three byte encoding */
|
||||||
@ -188,142 +150,56 @@ char *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int inp
|
|||||||
/* check we have at least three bytes */
|
/* check we have at least three bytes */
|
||||||
if (bytes_left < 3) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
if (bytes_left < 3) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
||||||
/* check second byte starts with binary 10 */
|
/* check second byte starts with binary 10 */
|
||||||
else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||||
/* check third byte starts with binary 10 */
|
/* check third byte starts with binary 10 */
|
||||||
else if (((*(utf + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
else if (((*(utf + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||||
else {
|
else {
|
||||||
unicode_len = 3;
|
unicode_len = 3;
|
||||||
count+=6;
|
|
||||||
if(count <= len) {
|
|
||||||
/* compute character number */
|
/* compute character number */
|
||||||
d = ((c & 0x0F) << 12) | ((*(utf + 1) & 0x3F) << 6) | (*(utf + 2) & 0x3F);
|
d = ((c & 0x0F) << 12) | ((utf[1] & 0x3F) << 6) | (*(utf + 2) & 0x3F);
|
||||||
*data++ = '%';
|
|
||||||
*data++ = 'u';
|
|
||||||
unicode = apr_psprintf(mp, "%x", d);
|
|
||||||
length = strlen(unicode);
|
|
||||||
|
|
||||||
switch(length) {
|
|
||||||
case 1:
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
case 5:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(j=0; j<length; j++) {
|
|
||||||
*data++ = unicode[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
*changed = 1;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* If first byte begins with binary 11110 it is four byte encoding */
|
/* If first byte begins with binary 11110 it is four byte encoding */
|
||||||
else if ((c & 0xF8) == 0xF0) {
|
else if ((c & 0xF8) == 0xF0) {
|
||||||
/* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/
|
/* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/
|
||||||
if (c >= 0xF5) {
|
if (c >= 0xF5) unicode_len = UNICODE_ERROR_RESTRICTED_CHARACTER;
|
||||||
*data++ = c;
|
|
||||||
}
|
|
||||||
/* check we have at least four bytes */
|
/* check we have at least four bytes */
|
||||||
if (bytes_left < 4) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
else if (bytes_left < 4) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
||||||
/* check second byte starts with binary 10 */
|
/* check second byte starts with binary 10 */
|
||||||
else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||||
/* check third byte starts with binary 10 */
|
/* check third byte starts with binary 10 */
|
||||||
else if (((*(utf + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
else if (((*(utf + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||||
/* check fourth byte starts with binary 10 */
|
/* check fourth byte starts with binary 10 */
|
||||||
else if (((*(utf + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
else if (((*(utf + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||||
else {
|
else {
|
||||||
unicode_len = 4;
|
unicode_len = 4;
|
||||||
count+=7;
|
|
||||||
if(count <= len) {
|
|
||||||
/* compute character number */
|
/* compute character number */
|
||||||
d = ((c & 0x07) << 18) | ((*(utf + 1) & 0x3F) << 12) | ((*(utf + 2) & 0x3F) << 6) | (*(utf + 3) & 0x3F);
|
d = ((c & 0x07) << 18) | ((utf[1] & 0x3F) << 12) | ((*(utf + 2) & 0x3F) << 6) | (*(utf + 3) & 0x3F);
|
||||||
*data++ = '%';
|
|
||||||
*data++ = 'u';
|
|
||||||
unicode = apr_psprintf(mp, "%x", d);
|
|
||||||
length = strlen(unicode);
|
|
||||||
|
|
||||||
switch(length) {
|
|
||||||
case 1:
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
*data++ = '0';
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
*data++ = '0';
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
case 5:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(j=0; j<length; j++) {
|
|
||||||
*data++ = unicode[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
*changed = 1;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
/* any other first byte is invalid (RFC 3629) */
|
|
||||||
else {
|
|
||||||
count++;
|
|
||||||
if(count <= len)
|
|
||||||
*data++ = c;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* invalid UTF-8 character number range (RFC 3629) */
|
/* invalid UTF-8 character number range (RFC 3629) */
|
||||||
if ((d >= 0xD800) && (d <= 0xDFFF)) {
|
if ((d >= 0xD800) && (d <= 0xDFFF)) unicode_len = UNICODE_ERROR_RESTRICTED_CHARACTER;
|
||||||
count++;
|
|
||||||
if(count <= len)
|
|
||||||
*data++ = c;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check for overlong */
|
/* check for overlong */
|
||||||
if ((unicode_len == 4) && (d < 0x010000)) {
|
if ((unicode_len == 4) && (d < 0x010000)) unicode_len = UNICODE_ERROR_OVERLONG_CHARACTER;
|
||||||
/* four byte could be represented with less bytes */
|
|
||||||
count++;
|
|
||||||
if(count <= len)
|
|
||||||
*data++ = c;
|
|
||||||
}
|
|
||||||
else if ((unicode_len == 3) && (d < 0x0800)) {
|
|
||||||
/* three byte could be represented with less bytes */
|
/* three byte could be represented with less bytes */
|
||||||
count++;
|
if ((unicode_len == 3) && (d < 0x0800)) unicode_len = UNICODE_ERROR_OVERLONG_CHARACTER;
|
||||||
if(count <= len)
|
|
||||||
*data++ = c;
|
|
||||||
}
|
|
||||||
else if ((unicode_len == 2) && (d < 0x80)) {
|
|
||||||
/* two byte could be represented with less bytes */
|
/* two byte could be represented with less bytes */
|
||||||
count++;
|
if ((unicode_len == 2) && (d < 0x80)) unicode_len = UNICODE_ERROR_OVERLONG_CHARACTER;
|
||||||
if(count <= len)
|
|
||||||
*data++ = c;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (unicode_len > 0) {
|
if (unicode_len > 0) {
|
||||||
i += unicode_len;
|
i += unicode_len;
|
||||||
} else {
|
sprintf(data, "%%u%04x", d);
|
||||||
|
data += 6;
|
||||||
|
*changed = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* any other first byte is invalid (RFC 3629), so assume it's an ASCII character */
|
||||||
|
*data++ = c;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*data = '\0';
|
*data = '\0';
|
||||||
|
|
||||||
return rval;
|
return rval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user