From 4bc068e58d8485e94695a5a7425a38fb3e831a0b Mon Sep 17 00:00:00 2001
From: Nick Galbreath <nickg@client9.com>
Date: Mon, 6 May 2013 22:29:23 +0900
Subject: [PATCH] Sync to 1.2.0, fix regression in small sqli detection

---
 apache2/libinjection/sqlparse.c         | 362 +++++++++++++++++-------
 apache2/libinjection/sqlparse.h         |   2 +-
 apache2/libinjection/sqlparse_private.h |  32 ---
 3 files changed, 259 insertions(+), 137 deletions(-)

diff --git a/apache2/libinjection/sqlparse.c b/apache2/libinjection/sqlparse.c
index a8582f66..43c51343 100644
--- a/apache2/libinjection/sqlparse.c
+++ b/apache2/libinjection/sqlparse.c
@@ -68,7 +68,15 @@ memchr2(const char *haystack, size_t haystack_len, char c0, char c1)
     return NULL;
 }
 
-
+/** Find the length of the longest leading run made only of certain characters.
+ *
+ * C Standard library 'strspn' only works for 'c-strings' (null terminated)
+ * This works on arbitrary length.
+ *
+ * Porting notes:
+ *   if accept is 'ABC', then this function would be similar to
+ *   a_regexp.match(a_str, '[ABC]*'),
+ */
 size_t strlenspn(const char *s, size_t len, const char *accept)
 {
     size_t i;
@@ -102,11 +110,96 @@ int cstrcasecmp(const char *a, const char *b)
     return ca - cb;
 }
 
+/**
+ * Case-insensitive string equality: 1 if cstrcasecmp(a, b) is 0, else 0.
+ *  Here only to make code more readable
+ */
 int streq(const char *a, const char *b)
 {
     return cstrcasecmp(a, b) == 0;
 }
 
+/*
+ * Case-sensitive binary search: TRUE if strcmp matches key against an
+ * entry of base[0..nmemb), else FALSE.  base must be in ascending strcmp order.
+ */
+int bsearch_cstr(const char *key, const char *base[], size_t nmemb)
+{
+    int left = 0;
+    int right = (int) nmemb - 1; /* -1 when nmemb is 0: the loop is skipped */
+
+    while (left <= right) {
+        int pos = (left + right) / 2;
+        int cmp = strcmp(base[pos], key);
+        if (cmp == 0) {
+            return TRUE;
+        } else if (cmp < 0) {
+            left = pos + 1; /* base[pos] sorts before key: search upper half */
+        } else {
+            right = pos - 1; /* base[pos] sorts after key: search lower half */
+        }
+    }
+    return FALSE;
+}
+
+/*
+ * Case-insensitive binary search: TRUE if cstrcasecmp matches key in base.
+ */
+int bsearch_cstrcase(const char *key, const char *base[], size_t nmemb)
+{
+    int left = 0;
+    int right = (int) nmemb - 1; /* -1 when nmemb is 0: the loop is skipped */
+
+    while (left <= right) {
+        int pos = (left + right) / 2;
+        int cmp = cstrcasecmp(base[pos], key);
+        if (cmp == 0) {
+            return TRUE;
+        } else if (cmp < 0) {
+            left = pos + 1; /* base[pos] sorts before key: search upper half */
+        } else {
+            right = pos - 1; /* base[pos] sorts after key: search lower half */
+        }
+    }
+    return FALSE;
+}
+
+/**
+ * Binary search of a keyword table, comparing words with cstrcasecmp.
+ * Returns the matching entry's type, or CHAR_NULL when key is not present.
+ * keywords[0..numb) must be ordered by word for the search to be correct.
+ * Porting Notes:
+ *  given a mapping/hash of string to char
+ *  this is just
+ *     mapping[key.upper()]
+ */
+char bsearch_keyword_type(const char *key, const keyword_t * keywords,
+                          size_t numb)
+{
+    int left = 0;
+    int right = (int) numb - 1; /* -1 when numb is 0: the loop is skipped */
+
+    while (left <= right) {
+        int pos = (left + right) / 2;
+        int cmp = cstrcasecmp(keywords[pos].word, key);
+        if (cmp == 0) {
+            return keywords[pos].type;
+        } else if (cmp < 0) {
+            left = pos + 1; /* entry sorts before key: search upper half */
+        } else {
+            right = pos - 1; /* entry sorts after key: search lower half */
+        }
+    }
+    return CHAR_NULL;
+}
+
+/* st_token methods
+ *
+ * The following just manipulate the stoken_t type
+ *
+ *
+ */
+
 void st_clear(stoken_t * st)
 {
     st->type = CHAR_NULL;
@@ -141,74 +234,11 @@ void st_copy(stoken_t * dest, const stoken_t * src)
     memcpy(dest, src, sizeof(stoken_t));
 }
 
-const char *bsearch_cstrcase(const char *key, const char *base[], size_t nmemb)
-{
-    int left = 0;
-    int right = (int) nmemb - 1;
-
-    while (left <= right) {
-        int pos = (left + right) / 2;
-        int cmp = cstrcasecmp(base[pos], key);
-        if (cmp == 0) {
-            return base[pos];
-        } else if (cmp < 0) {
-            left = pos + 1;
-        } else {
-            right = pos - 1;
-        }
-    }
-    return NULL;
-}
-
-const char *bsearch_cstr(const char *key, const char *base[], size_t nmemb)
-{
-    int left = 0;
-    int right = (int) nmemb - 1;
-
-    while (left <= right) {
-        int pos = (left + right) / 2;
-        int cmp = strcmp(base[pos], key);
-        if (cmp == 0) {
-            return base[pos];
-        } else if (cmp < 0) {
-            left = pos + 1;
-        } else {
-            right = pos - 1;
-        }
-    }
-    return NULL;
-}
-
-char bsearch_keyword_type(const char *key, const keyword_t * keywords,
-                          size_t numb)
-{
-    int left = 0;
-    int right = (int) numb - 1;
-
-    while (left <= right) {
-        int pos = (left + right) / 2;
-        int cmp = cstrcasecmp(keywords[pos].word, key);
-        if (cmp == 0) {
-            return keywords[pos].type;
-        } else if (cmp < 0) {
-            left = pos + 1;
-        } else {
-            right = pos - 1;
-        }
-    }
-    return CHAR_NULL;
-}
-
-int is_operator2(const char *key)
-{
-    return bsearch_cstrcase(key, operators2, operators2_sz) != NULL;
-}
-
 int st_is_multiword_start(const stoken_t * st)
 {
     return bsearch_cstrcase(st->val,
-                        multikeywords_start,
-                        multikeywords_start_sz) != NULL;
+                            multikeywords_start,
+                            multikeywords_start_sz);
 }
 
 int st_is_unary_op(const stoken_t * st)
@@ -236,6 +266,12 @@ int st_is_arith_op(const stoken_t * st)
                                  cstrcasecmp(st->val, "DIV")));
 }
 
+/* Parsers
+ *
+ *
+ */
+
+
 size_t parse_white(sfilter * sf)
 {
     return sf->pos + 1;
@@ -406,6 +442,9 @@ size_t parse_backslash(sfilter * sf)
     const size_t slen = sf->slen;
     size_t pos = sf->pos;
 
+    /*
+     * Weird MySQL alias for NULL, "\N" (capital N only)
+     */
     if (pos + 1 < slen && cs[pos + 1] == 'N') {
         st_assign(current, '1', "NULL", 4);
         return pos + 2;
@@ -414,6 +453,14 @@ size_t parse_backslash(sfilter * sf)
     }
 }
 
+/** Is input a 2-char operator?
+ * TRUE if key is found in operators2 via the case-sensitive bsearch_cstr.
+ */
+int is_operator2(const char *key)
+{
+    return bsearch_cstr(key, operators2, operators2_sz);
+}
+
 size_t parse_operator2(sfilter * sf)
 {
     stoken_t *current = &sf->syntax_current;
@@ -703,19 +750,44 @@ int parse_token(sfilter * sf)
 
     st_clear(current);
 
+    /*
+     * if we are at beginning of string
+     *  and in single-quote or double quote mode
+     *  then pretend the input starts with a quote
+     */
     if (*pos == 0 && sf->delim != CHAR_NULL) {
         *pos = parse_string_core(s, slen, 0, current, sf->delim, 0);
         return TRUE;
     }
 
     while (*pos < slen) {
+        /*
+         * get current character
+         */
         const int ch = (int) (s[*pos]);
+
+        /*
+         * if not ascii, then continue...
+         *   actually we probably need to just assume
+         *   it's a string
+         */
         if (ch < 0 || ch > 127) {
             *pos += 1;
             continue;
         }
+
+        /*
+         * look up the parser, and call it
+         *
+         * Porting Note: this is mapping of char to function
+         *   charparsers[ch]()
+         */
         fnptr = char_parse_map[ch];
         *pos = (*fnptr) (sf);
+
+        /*
+         * the parser produced a token: hand it back to the caller
+         */
         if (current->type != CHAR_NULL) {
             return TRUE;
         }
@@ -770,6 +842,10 @@ int syntax_merge_words(stoken_t * a, stoken_t * b)
     }
 }
 
+/* This does some simple syntax cleanup based on the token
+ *
+ *
+ */
 int sqli_tokenize(sfilter * sf, stoken_t * sout)
 {
     stoken_t *last = &sf->syntax_last;
@@ -777,6 +853,10 @@ int sqli_tokenize(sfilter * sf, stoken_t * sout)
 
     while (parse_token(sf)) {
         char ttype = current->type;
+
+        /*
+         * TBD: hmm forgot logic here.
+         */
         if (ttype == 'c') {
             st_copy(&sf->syntax_comment, current);
             continue;
@@ -784,7 +864,13 @@ int sqli_tokenize(sfilter * sf, stoken_t * sout)
         st_clear(&sf->syntax_comment);
 
         /*
-         * If we don't have a saved token
+         * If we don't have a saved token, and we have
+         * a string: save it.  if the next token is also a string
+         *   then merge them.  e.g. "A" "B" in SQL is actually "AB"
+         * a n/k/U/o type: save since next token may be merged together
+         *   for example: "LEFT" + "JOIN" = "LEFT JOIN"
+         * a o/& type: TBD need to review.
+         *
          */
         if (last->type == CHAR_NULL) {
             switch (ttype) {
@@ -909,6 +995,9 @@ int sqli_tokenize(sfilter * sf, stoken_t * sout)
         st_clear(last);
         return TRUE;
     } else if (sf->syntax_comment.type) {
+        /*
+         * TBD
+         */
         st_copy(sout, &sf->syntax_comment);
         st_clear(&sf->syntax_comment);
         return TRUE;
@@ -917,6 +1006,9 @@ int sqli_tokenize(sfilter * sf, stoken_t * sout)
     }
 }
 
+/*
+ * My apologies, this code is a mess
+ */
 int filter_fold(sfilter * sf, stoken_t * sout)
 {
     stoken_t *last = &sf->fold_last;
@@ -926,7 +1018,7 @@ int filter_fold(sfilter * sf, stoken_t * sout)
         st_copy(sout, last);
         sf->fold_state = 2;
         st_clear(last);
-        return TRUE;
+        return FALSE;
     }
 
     while (sqli_tokenize(sf, current)) {
@@ -952,7 +1044,7 @@ int filter_fold(sfilter * sf, stoken_t * sout)
                 st_copy(last, current);
             }
             st_copy(sout, current);
-            return TRUE;
+            return FALSE;
         } else if (last->type == '(' && st_is_unary_op(current)) {
             /*
              * similar to beginning of statement
@@ -965,7 +1057,7 @@ int filter_fold(sfilter * sf, stoken_t * sout)
              * emit 1, but keep state
              */
             st_copy(sout, current);
-            return TRUE;
+            return FALSE;
         } else if ((last->type == '1' || last->type == 'n')
                    && st_is_arith_op(current)) {
             FOLD_DEBUG;
@@ -987,7 +1079,7 @@ int filter_fold(sfilter * sf, stoken_t * sout)
                     st_copy(sout, current);
                     st_clear(last);
                 }
-                return TRUE;
+                return FALSE;
             } else {
                 if (last->type == 'o') {
                     st_copy(sout, last);
@@ -998,7 +1090,7 @@ int filter_fold(sfilter * sf, stoken_t * sout)
                     st_copy(sout, current);
                     st_clear(last);
                 }
-                return TRUE;
+                return FALSE;
             }
         }
     }
@@ -1007,52 +1099,54 @@ int filter_fold(sfilter * sf, stoken_t * sout)
         if (st_is_arith_op(last)) {
             st_copy(sout, last);
             st_clear(last);
-            return TRUE;
+            return FALSE;
         } else {
             st_clear(last);
         }
     }
 
-    return FALSE;
+    /*
+     * all done: nothing more to parse
+     */
+    return TRUE;
 }
 
+/* secondary api: detects SQLi in a string, GIVEN a context.
+ *
+ * A context can be:
+ *   *  CHAR_NULL (\0), process as is
+ *   *  CHAR_SINGLE ('), process pretending input started with a
+ *          single quote.
+ *   *  CHAR_DOUBLE ("), process pretending input started with a
+ *          double quote.
+ *
+ */
 int is_string_sqli(sfilter * sql_state, const char *s, size_t slen,
                     const char delim, ptr_fingerprints_fn fn)
 {
-    int all_done = 0;
     int tlen = 0;
+    char ch;
     int patmatch;
+    int all_done;
 
     sfilter_reset(sql_state, s, slen);
     sql_state->delim = delim;
 
     while (tlen < MAX_TOKENS) {
         all_done = filter_fold(sql_state, &(sql_state->tokenvec[tlen]));
-        if (!all_done) {
+        if (all_done) {
             break;
         }
 
         sql_state->pat[tlen] = sql_state->tokenvec[tlen].type;
         tlen += 1;
     }
-    sql_state->pat[tlen] = CHAR_NULL;
 
     /*
-     * if token 5 (last) looks like a functino word (such as ABS or ASCII)
-     * then check token 6 to see if it's a "(".
-     * if NOT then, it's not a function.
+     * make the fingerprint pattern a c-string (null delimited)
      */
+    sql_state->pat[tlen] = CHAR_NULL;
 
-    if (tlen == MAX_TOKENS && !all_done
-        && sql_state->pat[MAX_TOKENS - 1] == 'f') {
-
-        stoken_t tmp;
-        all_done = filter_fold(sql_state, &tmp);
-        if (!all_done && tmp.type != '(') {
-            sql_state->reason = __LINE__;
-            return FALSE;
-        }
-    }
     /*
      * check for 'X' in pattern
      * this means parsing could not be done
@@ -1066,10 +1160,22 @@ int is_string_sqli(sfilter * sql_state, const char *s, size_t slen,
 
     patmatch = fn(sql_state->pat);
 
+    /*
+     * No match.
+     *
+     * Set sql_state->reason to current line number
+     * only for debugging purposes.
+     */
     if (!patmatch) {
         sql_state->reason = __LINE__;
         return FALSE;
     }
+
+    /*
+     * We got a SQLi match
+     * This next part just helps reduce false positives.
+     *
+     */
     switch (tlen) {
     case 2:{
         /*
@@ -1090,16 +1196,6 @@ int is_string_sqli(sfilter * sql_state, const char *s, size_t slen,
                 sql_state->reason = __LINE__;
                 return FALSE;
         }
-        /*
-         * detect obvious sqli scans.. many people put '--' in plain text
-         * so only detect if input ends with '--', e.g. 1-- but not 1-- foo
-         */
-
-        if ((strlen(sql_state->tokenvec[1].val) > 2)
-            && sql_state->tokenvec[1].val[0] == '-') {
-            sql_state->reason = __LINE__;
-            return FALSE;
-        }
 
         /**
          * there are some odd base64-looking query string values
@@ -1107,14 +1203,46 @@ int is_string_sqli(sfilter * sql_state, const char *s, size_t slen,
          * which evaluate to "1c"... these are not SQLi
          * but 1234-- probably is.
          * Make sure the "1" in "1c" is actually a true decimal number
+         *
+         * Need to check -original- string since the folding step
+         * may have merged tokens, e.g. "1+FOO" is folded into "1"
          */
-        if (sql_state->tokenvec[0].type == '1'&& sql_state->tokenvec[1].type == 'c' &&
-            strlen(sql_state->tokenvec[0].val) != strcspn(sql_state->tokenvec[0].val, "0123456789")) {
+        if (sql_state->tokenvec[0].type == '1'&& sql_state->tokenvec[1].type == 'c') {
+            /*
+             * we check that next character after the number is either whitespace,
+             * or '/' or a '-' ==> sqli.
+             */
+            ch = sql_state->s[strlen(sql_state->tokenvec[0].val)];
+            if ( ch <= 32 ) {
+                /* next char was whitespace,e.g. "1234 --"
+                 * this isn't exactly correct.. ideally we should skip over all whitespace
+                 * but this seems to be ok for now
+                 */
+                return TRUE;
+            }
+            if (ch == '/' && sql_state->s[strlen(sql_state->tokenvec[0].val) + 1] == '*') {
+                return TRUE;
+            }
+            if (ch == '-' && sql_state->s[strlen(sql_state->tokenvec[0].val) + 1] == '-') {
+                return TRUE;
+            }
+
             sql_state->reason = __LINE__;
             return FALSE;
         }
-        break;
+
+        /*
+         * detect obvious sqli scans.. many people put '--' in plain text
+         * so only detect if input ends with '--', e.g. 1-- but not 1-- foo
+         */
+        if ((strlen(sql_state->tokenvec[1].val) > 2)
+            && sql_state->tokenvec[1].val[0] == '-') {
+            sql_state->reason = __LINE__;
+            return FALSE;
         }
+
+        break;
+    } /* case 2 */
     case 3:{
         /*
          * ...foo' + 'bar...
@@ -1138,7 +1266,7 @@ int is_string_sqli(sfilter * sql_state, const char *s, size_t slen,
                 }
                 break;
         }
-    }                       /* case 3 */
+    }  /* case 3 */
     case 5: {
         if (streq(sql_state->pat, "sosos")) {
             if (sql_state->tokenvec[0].str_open == CHAR_NULL) {
@@ -1157,30 +1285,56 @@ int is_string_sqli(sfilter * sql_state, const char *s, size_t slen,
         }
     } /* case 5 */
     } /* end switch */
+
     return TRUE;
 }
 
+/**  Main API, detects SQLi in an input.
+ * Returns TRUE if is_string_sqli flags s (slen bytes) when parsed as-is, or
+ * as if it began with ' or " (tried only when s contains one); else FALSE.
+ */
 int is_sqli(sfilter * sql_state, const char *s, size_t slen,
             ptr_fingerprints_fn fn)
 {
 
+    /*
+     * no input? not sqli
+     */
     if (slen == 0) {
         return FALSE;
     }
 
+    /*
+     * test input "as-is"
+     */
     if (is_string_sqli(sql_state, s, slen, CHAR_NULL, fn)) {
         return TRUE;
     }
 
+    /*
+     * if input has a single_quote, then
+     * test as if input actually started with '
+     * example: if input is "1' = 1", then pretend it's
+     *   "'1' = 1"
+     * Porting Notes: this is roughly the same as doing
+     *   is_string_sqli(sql_state, "'" + s, slen+1, NULL, fn)
+     *
+     */
     if (memchr(s, CHAR_SINGLE, slen)
         && is_string_sqli(sql_state, s, slen, CHAR_SINGLE, fn)) {
         return TRUE;
     }
 
+    /*
+     * same as above but with a double-quote "
+     */
     if (memchr(s, CHAR_DOUBLE, slen)
         && is_string_sqli(sql_state, s, slen, CHAR_DOUBLE, fn)) {
         return TRUE;
     }
 
+    /*
+     * Hurray, input is not SQLi
+     */
     return FALSE;
 }
diff --git a/apache2/libinjection/sqlparse.h b/apache2/libinjection/sqlparse.h
index f626aaa7..838a0471 100644
--- a/apache2/libinjection/sqlparse.h
+++ b/apache2/libinjection/sqlparse.h
@@ -34,7 +34,7 @@ extern "C" {
  * See python's normalized version
  * http://www.python.org/dev/peps/pep-0386/#normalizedversion
  */
-#define LIBINJECTION_VERSION "1.1.0"
+#define LIBINJECTION_VERSION "1.2.0"
 
 #define ST_MAX_SIZE 32
 #define MAX_TOKENS 5
diff --git a/apache2/libinjection/sqlparse_private.h b/apache2/libinjection/sqlparse_private.h
index e080a009..03f1bccc 100644
--- a/apache2/libinjection/sqlparse_private.h
+++ b/apache2/libinjection/sqlparse_private.h
@@ -13,36 +13,6 @@
 
 #include "sqlparse.h"
 
-/***
- * The stdlib function 'strspn' assumes input to null-delimited.
- * This allows us to specifying and input length and allows
- * embedded nulls
- */
-size_t strlenspn(const char *s, size_t len, const char *accept);
-
-int streq(const char *a, const char *b);
-
-void st_clear(stoken_t * st);
-void st_assign_char(stoken_t * st, const char stype, const char value);
-void st_set_type(stoken_t * st, const char stype);
-void st_assign(stoken_t * st, const char stype, const char *value,
-               size_t len);
-void st_assign_cstr(stoken_t * st, const char stype, const char *value);
-void st_copy(stoken_t * dest, const stoken_t * src);
-
-int st_equals_cstr(const stoken_t * src, const char stype,
-                    const char *value);
-
-int st_is_empty(const stoken_t * st);
-int st_is_arith_op(const stoken_t * st);
-int st_is_unary_op(const stoken_t * st);
-int st_is_english_op(const stoken_t * st);
-int st_is_logical_op(const stoken_t * st);
-int st_is_multiword_start(const stoken_t * st);
-
-const char *bsearch_cstr(const char *key, const char *base[],
-                         size_t nmemb);
-
 typedef struct {
     const char *word;
     char type;
@@ -96,7 +66,5 @@ int sqli_tokenize(sfilter * sf, stoken_t * sout);
 
 int filter_fold(sfilter * sf, stoken_t * sout);
 
-int char2int(char c);
-unsigned long long pat2int(const char *pat);
 
 #endif /* _SQLPARSE_PRIVATE_H */