From 3e5e2a06b7cbeef3415b18d8b529e6c907e776dc Mon Sep 17 00:00:00 2001
From: brectanus <brectanus@9017d574-64ec-4062-9424-5e00b32a252b>
Date: Tue, 31 Jul 2007 19:04:07 +0000
Subject: [PATCH] Stricter validation for @validateUtf8Encoding. Capture the
 match in TX:0 when using "capture" action w/@pm operators.

---
 CHANGES                |  36 ++++++++++
 apache2/re_operators.c | 156 ++++++++++++++++++++++++++---------------
 2 files changed, 135 insertions(+), 57 deletions(-)

diff --git a/CHANGES b/CHANGES
index 00250a9e..8f0ceae8 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,9 +1,45 @@
 ?? ??? 2007 - 2.5.0-trunk
 -------------------------
 
+ * Stricter validation for @validateUtf8Encoding.
+
+ * Capture the match in TX:0 when using "capture" action in phrase match
+   operators.
+
  * Added Cygwin to the list of platforms not supporting the hidden
    visibility attribute.
 
+
+27 July 2007 - 2.1.2
+--------------------
+
+ * Update included core rules to latest version (1.4.3).
+
+ * Enhanced ability to alert/audit failed requests.
+
+ * Do not trigger "pause" action for internal requests.
+
+ * Fixed issue with requests that use internal requests.  These had the
+   potential to be intercepted incorrectly when other Apache httpd modules
+   that used internal requests were used with mod_security.
+
+ * Added Solaris and Cygwin to the list of platforms not supporting the hidden
+   visibility attribute.
+
+ * Fixed decoding full-width unicode in t:urlDecodeUni.
+
+ * Lessen some overhead of debugging messages and calculations.
+
+ * Do not try to intercept a request after a failed rule.  This fixes the
+   issue associated with an "Internal Error: Asked to intercept request
+   but was_intercepted is zero" error message.
+
+ * Added SecAuditLog2 directive to allow redundant concurrent audit log
+   index files.  This will allow sending audit data to two consoles, etc.
+
+ * Small performance improvement in memory management for rule execution.
+
+
 21 June 2007 - 2.5.0-dev2
 -------------------------
 
diff --git a/apache2/re_operators.c b/apache2/re_operators.c
index ea723d81..44be6b5b 100644
--- a/apache2/re_operators.c
+++ b/apache2/re_operators.c
@@ -308,10 +308,14 @@ static int msre_op_pmFromFile_param_init(msre_rule *rule, char **error_msg) {
 static int msre_op_pm_execute(modsec_rec *msr, msre_rule *rule, msre_var *var, char **error_msg) {
     const char *match = NULL;
     apr_status_t rc = 0;
+    int capture;
     
     /* Nothing to read */
     if ((var->value == NULL) || (var->value_len == 0)) return 0;
 
+    /* Are we supposed to capture subexpressions? */
+    capture = apr_table_get(rule->actionset->actions, "capture") ? 1 : 0;
+
     ACMPT pt = {(ACMP *)rule->op_param_data, NULL};
 
     rc = acmp_process_quick(&pt, &match, var->value, var->value_len);
@@ -326,6 +330,33 @@ static int msre_op_pm_execute(modsec_rec *msr, msre_rule *rule, msre_var *var, c
             *error_msg = apr_psprintf(msr->mp, "Matched phrase \"%s\" at %s.",
                 match_escaped, var->name);
         }
+
+        /* Handle capture as tx.0=match */
+        if (capture) {
+            int i;
+            msc_string *s = (msc_string *)apr_pcalloc(msr->mp, sizeof(msc_string));
+
+            if (s == NULL) return -1;
+
+            s->name = "0";
+            s->value = apr_pstrdup(msr->mp, match);
+            if (s->value == NULL) return -1;
+            s->value_len = strlen(s->value);
+            apr_table_setn(msr->tx_vars, s->name, (void *)s);
+
+            if (msr->txcfg->debuglog_level >= 9) {
+                msr_log(msr, 9, "Adding phrase match to TXVARS (0): %s",
+                    log_escape_nq_ex(msr->mp, s->value, s->value_len));
+            }
+
+            /* Unset the remaining ones (from previous invocations). */
+            for(i = rc; i <= 9; i++) {
+                char buf[2];
+                apr_snprintf(buf, sizeof(buf), "%i", i);
+                apr_table_unset(msr->tx_vars, buf);
+            }
+        }
+
         return 1;
     }
     return rc;
@@ -1145,89 +1176,91 @@ static int msre_op_validateUrlEncoding_execute(modsec_rec *msr, msre_rule *rule,
 #define UNICODE_ERROR_CHARACTERS_MISSING    -1
 #define UNICODE_ERROR_INVALID_ENCODING      -2
 #define UNICODE_ERROR_OVERLONG_CHARACTER    -3
+#define UNICODE_ERROR_RESTRICTED_CHARACTER  -4
+#define UNICODE_ERROR_DECODING_ERROR        -5
 
+/* NOTE: This is over-commented for ease of verification */
 static int detect_utf8_character(const char *p_read, unsigned int length) {
     int unicode_len = 0;
     unsigned int d = 0;
     unsigned char c;
 
-    if (p_read == NULL) return 0;
+    if (p_read == NULL) return UNICODE_ERROR_DECODING_ERROR;
     c = *p_read;
-    if (c == 0) return 0;
 
-    if ((c & 0xE0) == 0xC0) {
-        /* two byte unicode */
+    /* If first byte begins with binary 0 it is single byte encoding */
+    if ((c & 0x80) == 0) {
+        /* single byte unicode (7 bit ASCII equivalent) has no validation */
+        return 1;
+    }
+    /* If first byte begins with binary 110 it is two byte encoding*/
+    else if ((c & 0xE0) == 0xC0) {
+        /* check we have at least two bytes */
         if (length < 2) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
-        else
-        if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        /* check second byte starts with binary 10 */
+        else if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
         else {
             unicode_len = 2;
+            /* compute character number */
             d = ((c & 0x1F) << 6) | (*(p_read + 1) & 0x3F);
         }
     }
+    /* If first byte begins with binary 1110 it is three byte encoding */
     else if ((c & 0xF0) == 0xE0) {
-        /* three byte unicode */
+        /* check we have at least three bytes */
         if (length < 3) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
-        else
-        if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        /* check second byte starts with binary 10 */
+        else if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        /* check third byte starts with binary 10 */
+        else if (((*(p_read + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
         else {
             unicode_len = 3;
+            /* compute character number */
             d = ((c & 0x0F) << 12) | ((*(p_read + 1) & 0x3F) << 6) | (*(p_read + 2) & 0x3F);
         }
     }
+    /* If first byte begins with binary 11110 it is four byte encoding */
     else if ((c & 0xF8) == 0xF0) {
-        /* four byte unicode */
+        /* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/
+        if (c >= 0xF5) {
+            return UNICODE_ERROR_RESTRICTED_CHARACTER;
+        }
+        /* check we have at least four bytes */
         if (length < 4) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
-        else
-        if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        /* check second byte starts with binary 10 */
+        else if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        /* check third byte starts with binary 10 */
+        else if (((*(p_read + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+        /* check fourth byte starts with binary 10 */
+        else if (((*(p_read + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
         else {
-            d = ((c & 0x07) << 18) | ((*(p_read + 1) & 0x3F) << 12) | ((*(p_read + 2) & 0x3F) < 6) | (*(p_read + 3) & 0x3F);
             unicode_len = 4;
+            /* compute character number */
+            d = ((c & 0x07) << 18) | ((*(p_read + 1) & 0x3F) << 12) | ((*(p_read + 2) & 0x3F) < 6) | (*(p_read + 3) & 0x3F);
         }
     }
-    else if ((c & 0xFC) == 0xF8) {
-        /* five byte unicode */
-        if (length < 5) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
-        else
-        if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 4)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else {
-            d = ((c & 0x03) << 24) | ((*(p_read + 1) & 0x3F) << 18) | ((*(p_read + 2) & 0x3F) << 12) | ((*(p_read + 3) & 0x3F) << 6) | (*(p_read + 4) & 0x3F);
-            unicode_len = 5;
-        }
-    }
-    else if ((c & 0xFE) == 0xFC) {
-        /* six byte unicode */
-        if (length < 6) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
-        else
-        if (((*(p_read + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 4)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else
-        if (((*(p_read + 5)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
-        else {
-            d = ((c & 0x01) << 30) | ((*(p_read + 1) & 0x3F) << 24) | ((*(p_read + 2) & 0x3F) << 18) | ((*(p_read + 3) & 0x3F) << 12) | ((*(p_read + 4) & 0x3F) << 6) | (*(p_read + 5) & 0x3F);
-            unicode_len = 6;
-        }
+    /* any other first byte is invalid (RFC 3629) */
+    else {
+        return UNICODE_ERROR_INVALID_ENCODING;
     }
 
-    if ((unicode_len > 1)&&((d & 0x7F) == d)) {
-        unicode_len = UNICODE_ERROR_OVERLONG_CHARACTER;
+    /* invalid UTF-8 character number range (RFC 3629) */
+    if ((d >= 0xD800) && (d <= 0xDFFF)) {
+        return UNICODE_ERROR_RESTRICTED_CHARACTER;
+    }
+
+    /* check for overlong */
+    if ((unicode_len == 4) && (d < 0x010000)) {
+        /* four byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    }
+    else if ((unicode_len == 3) && (d < 0x0800)) {
+        /* three byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
+    }
+    else if ((unicode_len == 2) && (d < 0x80)) {
+        /* two byte could be represented with less bytes */
+        return UNICODE_ERROR_OVERLONG_CHARACTER;
     }
 
     return unicode_len;
@@ -1239,6 +1272,7 @@ static int msre_op_validateUtf8Encoding_execute(modsec_rec *msr, msre_rule *rule
     unsigned int i, bytes_left;
 
     bytes_left = var->value_len;
+
     for(i = 0; i < var->value_len; i++) {
         int rc = detect_utf8_character(&var->value[i], bytes_left);
         switch(rc) {
@@ -1248,18 +1282,26 @@ static int msre_op_validateUtf8Encoding_execute(modsec_rec *msr, msre_rule *rule
                 return 1;
                 break;
             case UNICODE_ERROR_INVALID_ENCODING :
-                *error_msg = apr_psprintf(msr->mp, "Invalid Unicode encoding: invalid byte value "
+                *error_msg = apr_psprintf(msr->mp, "Invalid UTF-8 encoding: invalid byte value "
                     "in character.");
                 return 1;
                 break;
             case UNICODE_ERROR_OVERLONG_CHARACTER :
-                *error_msg = apr_psprintf(msr->mp, "Invalid Unicode encoding: overlong "
+                *error_msg = apr_psprintf(msr->mp, "Invalid UTF-8 encoding: overlong "
                     "character detected.");
                 return 1;
                 break;
+            case UNICODE_ERROR_RESTRICTED_CHARACTER :
+                *error_msg = apr_psprintf(msr->mp, "Invalid UTF-8 encoding: use of restricted character");
+                return 1;
+                break;
+            case UNICODE_ERROR_DECODING_ERROR :
+                *error_msg = apr_psprintf(msr->mp, "Error validating UTF-8 decoding");
+                return 1;
+                break;
         }
 
-        bytes_left--;
+        bytes_left -= rc;
     }
 
     return 0;