diff --git a/CHANGES b/CHANGES index d1b1484b..cf641b02 100644 --- a/CHANGES +++ b/CHANGES @@ -11,6 +11,8 @@ XX NNN 2012 - 2.7.0-rc3 * Added IIS and Ngnix platform code. + * Added new transformation utf8toUnicode. + 23 Jul 2012 - 2.6.7 ------------------- diff --git a/apache2/msc_util.c b/apache2/msc_util.c index 5c9c127b..9af61719 100644 --- a/apache2/msc_util.c +++ b/apache2/msc_util.c @@ -74,6 +74,233 @@ static unsigned char *c2x(unsigned what, unsigned char *where); static unsigned char x2c(unsigned char *what); static unsigned char xsingle2c(unsigned char *what); +/** \brief Copy input into a newly allocated string, rewriting UTF-8 multi-byte sequences as %u followed by the code point in hex (zero-padded to at least four digits); single-byte characters are copied unchanged + * + * \param mp Pointer to memory pool + * \param input Pointer to input data + * \param input_len Input data length + * \param changed Set if data is changed + * + * \retval rval On Success, NULL if the allocation fails or input is NULL + */ +char *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed) { + int unicode_len = 0, length = 0; + unsigned int d = 0, count = 0; + unsigned char c, *utf; + char *rval, *data; + unsigned int i, len, j; + unsigned int bytes_left = input_len; + unsigned char *unicode = NULL; + + *changed = 0; + + len = input_len * 7 + 1; + data = rval = apr_palloc(mp, len); + if (rval == NULL) return NULL; + + + if (input == NULL) return NULL; + + for(i = 0; i < bytes_left;) { + unicode_len = 0; d = 0; + utf = (unsigned char *)&input[i]; + + c = *utf; + + /* If first byte begins with binary 0 it is single byte encoding */ + if ((c & 0x80) == 0) { + /* single byte unicode (7 bit ASCII equivalent) has no validation */ + count++; + if(count <= len) + *data++ = c; + } + /* If first byte begins with binary 110 it is two byte encoding */ + else if ((c & 0xE0) == 0xC0) { + /* check we have at least two bytes; NOTE(review): bytes_left is initialised to input_len and no update is visible in this view, so this may compare the total length rather than the bytes remaining at i - confirm */ + if (bytes_left < 2) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING; + /* check second byte starts with binary 10 */ + else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING; + else { + unicode_len = 2; + count+=6; + 
if(count <= len) { + /* compute character number */ + d = ((c & 0x1F) << 6) | (*(utf + 1) & 0x3F); + *data++ = '%'; + *data++ = 'u'; + unicode = apr_psprintf(mp, "%x", d); + length = strlen(unicode); + + switch(length) { + case 1: + *data++ = '0'; + *data++ = '0'; + *data++ = '0'; + break; + case 2: + *data++ = '0'; + *data++ = '0'; + break; + case 3: + *data++ = '0'; + break; + case 4: + case 5: + break; + } + + for(j=0; j= 0xF5) { + *data++ = c; + } + /* check we have at least four bytes */ + if (bytes_left < 4) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING; + /* check second byte starts with binary 10 */ + else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING; + /* check third byte starts with binary 10 */ + else if (((*(utf + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING; + /* check fourth byte starts with binary 10 */ + else if (((*(utf + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING; + else { + unicode_len = 4; + count+=7; + if(count <= len) { + /* compute character number; NOTE(review): the third-byte term below uses '< 6' (a comparison yielding 0 or 1) where the other terms use a left shift, so '<< 6' was probably intended - confirm */ + d = ((c & 0x07) << 18) | ((*(utf + 1) & 0x3F) << 12) | ((*(utf + 2) & 0x3F) < 6) | (*(utf + 3) & 0x3F); + *data++ = '%'; + *data++ = 'u'; + unicode = apr_psprintf(mp, "%x", d); + length = strlen(unicode); + + switch(length) { + case 1: + *data++ = '0'; + *data++ = '0'; + *data++ = '0'; + break; + case 2: + *data++ = '0'; + *data++ = '0'; + break; + case 3: + *data++ = '0'; + break; + case 4: + case 5: + break; + } + + for(j=0; j= 0xD800) && (d <= 0xDFFF)) { + count++; + if(count <= len) + *data++ = c; + } + + /* check for overlong */ + if ((unicode_len == 4) && (d < 0x010000)) { + /* four byte could be represented with less bytes */ + count++; + if(count <= len) + *data++ = c; + } + else if ((unicode_len == 3) && (d < 0x0800)) { + /* three byte could be represented with less bytes */ + count++; + if(count <= len) + *data++ = c; + } + else if ((unicode_len == 2) && (d < 0x80)) { + /* two byte could be represented 
with less bytes */ + count++; + if(count <= len) + *data++ = c; + } + + if(unicode_len > 0) { + i += unicode_len; + } else { + i++; + } + } + + *data ='\0'; + + return rval; +} + /** \brief Validate IPv4 Netmask * * \param ip_strv6 Pointer to ipv6 address diff --git a/apache2/msc_util.h b/apache2/msc_util.h index de1fd3ae..76227df8 100644 --- a/apache2/msc_util.h +++ b/apache2/msc_util.h @@ -40,6 +40,14 @@ int DSOLOCAL inet_pton(int family, const char *src, void *dst); #endif #endif +#define UNICODE_ERROR_CHARACTERS_MISSING -1 +#define UNICODE_ERROR_INVALID_ENCODING -2 +#define UNICODE_ERROR_OVERLONG_CHARACTER -3 +#define UNICODE_ERROR_RESTRICTED_CHARACTER -4 +#define UNICODE_ERROR_DECODING_ERROR -5 + +char DSOLOCAL *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed); + char DSOLOCAL *m_strcasestr(const char *haystack, const char *needle); int DSOLOCAL normalize_path_inplace(unsigned char *input, int len, int win, int *changed); diff --git a/apache2/re_operators.c b/apache2/re_operators.c index 3f1f2310..eaea360a 100644 --- a/apache2/re_operators.c +++ b/apache2/re_operators.c @@ -4024,12 +4024,6 @@ static int msre_op_validateUrlEncoding_execute(modsec_rec *msr, msre_rule *rule, /* validateUtf8Encoding */ -#define UNICODE_ERROR_CHARACTERS_MISSING -1 -#define UNICODE_ERROR_INVALID_ENCODING -2 -#define UNICODE_ERROR_OVERLONG_CHARACTER -3 -#define UNICODE_ERROR_RESTRICTED_CHARACTER -4 -#define UNICODE_ERROR_DECODING_ERROR -5 - /* NOTE: This is over-commented for ease of verification */ static int detect_utf8_character(const unsigned char *p_read, unsigned int length) { int unicode_len = 0; diff --git a/apache2/re_tfns.c b/apache2/re_tfns.c index c7b18436..133bfe23 100644 --- a/apache2/re_tfns.c +++ b/apache2/re_tfns.c @@ -495,6 +495,18 @@ static int msre_fn_urlDecodeUni_execute(apr_pool_t *mptmp, unsigned char *input, return changed; } +static int msre_fn_utf8Unicode_execute(apr_pool_t *mptmp, unsigned char *input, + long 
int input_len, char **rval, long int *rval_len) +{ + int changed = 0; + + *rval = (char *)utf8_unicode_inplace_ex(mptmp, input, input_len, &changed); + *rval_len = strlen(*rval); /* NOTE(review): utf8_unicode_inplace_ex returns NULL when its allocation fails or input is NULL; strlen(NULL) is undefined behaviour - confirm this path cannot be reached or guard it */ + + return changed; +} + + /* urlEncode */ static int msre_fn_urlEncode_execute(apr_pool_t *mptmp, unsigned char *input, @@ -1018,6 +1030,12 @@ void msre_engine_register_default_tfns(msre_engine *engine) { msre_fn_urlDecodeUni_execute ); + /* Utf8toUnicode */ + msre_engine_tfn_register(engine, + "Utf8toUnicode", + msre_fn_utf8Unicode_execute + ); + /* urlEncode */ msre_engine_tfn_register(engine, "urlEncode",