在上一篇中我们介绍了SQL简单语义分析概述
ailx10:SQL简单语义分析概述zhuanlan.zhihu.com本篇将和大家聊一聊SQL语义引擎中的指纹识别算法,这样也特别的绕,比XSS状态机还要绕,感兴趣的同学可以画画流程图,反正我是没有再画图了(不傻),采用GDB来分析的。
核心函数:int libinjection_sqli_fold(struct libinjection_sqli_state * sf)
疑难局部变量:size_t pos = 0; //下一个token的位置
left是已经折叠或处理的token数量的计数
我通读下来还是感觉有点困惑,所以准备写一篇GDB跟踪过程,欢迎提建议
step1:跳过最初的注释|左括号|一元运算符
第一次计算token
token.type取值TYPE_KEYWORD
;TYPE_UNION
;TYPE_GROUP
;TYPE_EXPRESSION
;TYPE_FUNCTION
;TYPE_BAREWORD
;TYPE_NUMBER
;TYPE_VARIABLE
;TYPE_STRING
;TYPE_OPERATOR
;TYPE_LOGIC_OPERATOR
;TYPE_COLLATE
;TYPE_RIGHTPARENS
;TYPE_LEFTBRACE
;TYPE_RIGHTBRACE
;TYPE_DOT
;TYPE_COMMA
;TYPE_COLON
;TYPE_SEMICOLON
;TYPE_TSQL
;TYPE_UNKNOWN
;TYPE_EVIL
;TYPE_FINGERPRINT
;TYPE_BACKSLASH
进入break分支,跳出当前循环。
token.type取值TYPE_NONE
直接跳出当前循环。
sf->current = &(sf->tokenvec[0]);
while (more) {
more = libinjection_sqli_tokenize(sf);
if ( ! (sf->current->type == TYPE_COMMENT || //'注释c'
sf->current->type == TYPE_LEFTPARENS || //'左括号('
sf->current->type == TYPE_SQLTYPE || //'t'
st_is_unary_op(sf->current))) { //一元运算符 + - ! ~ !! NOT
break;
}
}
step2:准备处理下一个token
if (! more) {
/* 如果全是注释 左括号 一元运算符 ,那么直接退出*/
return 0;
} else {
/* 注意这里的 pos 是局部变量pos ,表示下一个 token 的位置*/
pos += 1;
}
step3.1:while(1)
如果pos>=5
并且满足一定条件:
pos>5
令tokenvec[5]
覆盖tokenvec[1]
再使得pos=2
;left=0
pos=5
只令pos=1
;left=0
如果pos<5
或 pos>=5
但是不满足一定条件 直接跳过这段处理。
if (pos >= LIBINJECTION_SQLI_MAX_TOKENS) {//5
if (
( sf->tokenvec[0].type == TYPE_NUMBER &&//'1'
--> 1os1) | 1,s1) ) ||
( --> no(n) | no(1) ) ||
( --> 1),(1 ) ||
( --> n)o(n ))
{
if (pos > LIBINJECTION_SQLI_MAX_TOKENS)
{
st_copy(&(sf->tokenvec[1]), &(sf->tokenvec[LIBINJECTION_SQLI_MAX_TOKENS]));
pos = 2;
left = 0;
} else {
pos = 1;
left = 0;
}
}
}
step3.2:
if (! more || left >= LIBINJECTION_SQLI_MAX_TOKENS) {
left = pos;
break;
}
step4:第二次计算token
上一个token.type不等于TYPE_NONE
;并且pos<=5
;并且pos - left < 2
条件成立,sf->current
指针指向第pos
个token
,然后计算 token
token.type不等于TYPE_NONE
,
如果 token.type等于TYPE_COMMENT
,用token覆盖last_comment,
如果不等于,last_comment.type置为NULL
;pos +=1
,准备计算下一个token
while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && (pos - left) < 2) {
sf->current = &(sf->tokenvec[pos]);
more = libinjection_sqli_tokenize(sf);
if (more) {
if (sf->current->type == TYPE_COMMENT) {
st_copy(&last_comment, sf->current);
} else {
last_comment.type = CHAR_NULL;
pos += 1;
}
}
}
step5.1:
if (pos - left < 2) {
left = pos;
continue;
}
step5.2:第1次折叠
if ( ss ) {
pos -= 1;
sf->stats_folds += 1;
continue;
} else if ( ;; ) {
pos -= 1;
sf->stats_folds += 1;
continue;
} else if ( [o &] &&
(st_is_unary_op(&sf->tokenvec[left+1]) ||
sf->tokenvec[left+1].type == TYPE_SQLTYPE)) {
pos -= 1;
sf->stats_folds += 1;
left = 0;
continue;
} else if ( ( &&
st_is_unary_op(&sf->tokenvec[left+1])) {
pos -= 1;
sf->stats_folds += 1;
if (left > 0) {
left -= 1;
}
continue;
} else if (syntax_merge_words(sf, &sf->tokenvec[left], &sf->tokenvec[left+1])) {
pos -= 1;
sf->stats_folds += 1;
if (left > 0) {
left -= 1;
}
continue;
} else if ( ;f &&
(sf->tokenvec[left+1].val[0] == 'I' ||
sf->tokenvec[left+1].val[0] == 'i' ) &&
(sf->tokenvec[left+1].val[1] == 'F' ||
sf->tokenvec[left+1].val[1] == 'f' )) {
/* IF通常是函数,但是在Transact-SQL中可以用作独立的控制流操作符
形如; IF 1=1 需要将 f 转换为 T */
sf->tokenvec[left+1].type = TYPE_TSQL;
/* left += 2; */
continue; /* 重新排列 我们可能需要左移 left 和 pos */
} else if ((sf->tokenvec[left].type == TYPE_BAREWORD || sf->tokenvec[left].type == TYPE_VARIABLE) &&
sf->tokenvec[left+1].type == TYPE_LEFTPARENS && (
/* TSQL functions but common enough to be column names */
cstrcasecmp("USER_ID", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("USER_NAME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
/* Function in MYSQL */
cstrcasecmp("DATABASE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("PASSWORD", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
/* Mysql words that act as a variable and are a function */
/* TSQL current_users is fake-variable */
/* http://msdn.microsoft.com/en-us/library/ms176050.aspx */
cstrcasecmp("CURRENT_USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("CURRENT_DATE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("CURRENT_TIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("CURRENT_TIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("LOCALTIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("LOCALTIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
)) {
/* pos is the same
* other conversions need to go here... for instance
* password CAN be a function, coalesce CAN be a function
*/
sf->tokenvec[left].type = TYPE_FUNCTION;
continue;
} else if (sf->tokenvec[left].type == TYPE_KEYWORD && (
cstrcasecmp("IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("NOT IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
)) {
if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
/* got .... IN ( ... (or 'NOT IN')
* it's an operator
*/
sf->tokenvec[left].type = TYPE_OPERATOR;
} else {
/*
* it's a nothing
*/
sf->tokenvec[left].type = TYPE_BAREWORD;
}
/* "IN" can be used as "IN BOOLEAN MODE" for mysql
* in which case merging of words can be done later
* other wise it acts as an equality operator __ IN (values..)
*
* here we got "IN" "(" so it's an operator.
* also back track to handle "NOT IN"
* might need to do the same with like
* two use cases "foo" LIKE "BAR" (normal operator)
* "foo" = LIKE(1,2)
*/
continue;
} else if ((sf->tokenvec[left].type == TYPE_OPERATOR) && (
cstrcasecmp("LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("NOT LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0)) {
if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
/* SELECT LIKE(...
* it's a function
*/
sf->tokenvec[left].type = TYPE_FUNCTION;
}
} else if (sf->tokenvec[left].type == TYPE_SQLTYPE &&
(sf->tokenvec[left+1].type == TYPE_BAREWORD ||
sf->tokenvec[left+1].type == TYPE_NUMBER ||
sf->tokenvec[left+1].type == TYPE_SQLTYPE ||
sf->tokenvec[left+1].type == TYPE_LEFTPARENS ||
sf->tokenvec[left+1].type == TYPE_FUNCTION ||
sf->tokenvec[left+1].type == TYPE_VARIABLE ||
sf->tokenvec[left+1].type == TYPE_STRING)) {
st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
pos -= 1;
sf->stats_folds += 1;
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_COLLATE &&
sf->tokenvec[left+1].type == TYPE_BAREWORD) {
/*
* there are too many collation types.. so if the bareword has a "_"
* then it's TYPE_SQLTYPE
*/
if (strchr(sf->tokenvec[left+1].val, '_') != NULL) {
sf->tokenvec[left+1].type = TYPE_SQLTYPE;
left = 0;
}
} else if (sf->tokenvec[left].type == TYPE_BACKSLASH) {
if (st_is_arithmetic_op(&(sf->tokenvec[left+1]))) {
/* very weird case in TSQL where '%1' is parsed as '0 % 1', etc */
sf->tokenvec[left].type = TYPE_NUMBER;
} else {
/* just ignore it.. Again T-SQL seems to parse 1 as "1" */
st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
pos -= 1;
sf->stats_folds += 1;
}
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_LEFTPARENS &&
sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
pos -= 1;
left = 0;
sf->stats_folds += 1;
continue;
} else if (sf->tokenvec[left].type == TYPE_RIGHTPARENS &&
sf->tokenvec[left+1].type == TYPE_RIGHTPARENS) {
pos -= 1;
left = 0;
sf->stats_folds += 1;
continue;
} else if (sf->tokenvec[left].type == TYPE_LEFTBRACE &&
sf->tokenvec[left+1].type == TYPE_BAREWORD) {
/*
* MySQL Degenerate case --
*
* select { ``.``.id }; -- valid !!!
* select { ``.``.``.id }; -- invalid
* select ``.``.id; -- invalid
* select { ``.id }; -- invalid
*
* so it appears {``.``.id} is a magic case
* I suspect this is "current database, current table, field id"
*
* The folding code can't look at more than 3 tokens, and
* I don't want to make two passes.
*
* Since "{ ``" so rare, we are just going to blacklist it.
*
* Highly likely this will need revisiting!
*
* CREDIT @rsalgado 2013-11-25
*/
if (sf->tokenvec[left+1].len == 0) {
sf->tokenvec[left+1].type = TYPE_EVIL;
return (int)(left+2);
}
/* weird ODBC / MYSQL {foo expr} --> expr
* but for this rule we just strip away the "{ foo" part
*/
left = 0;
pos -= 2;
sf->stats_folds += 2;
continue;
} else if (sf->tokenvec[left+1].type == TYPE_RIGHTBRACE) {
pos -= 1;
left = 0;
sf->stats_folds += 1;
continue;
}
step6:折叠之后,第三次计算token,同第二次处理
while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && pos - left < 3) {
sf->current = &(sf->tokenvec[pos]);
more = libinjection_sqli_tokenize(sf);
if (more) {
if (sf->current->type == TYPE_COMMENT) {
st_copy(&last_comment, sf->current);
} else {
last_comment.type = CHAR_NULL;
pos += 1;
}
}
}
step7.1:
if (pos -left < 3) {
left = pos;
continue;
}
step7.2:第2次折叠
if (sf->tokenvec[left].type == TYPE_NUMBER &&
sf->tokenvec[left+1].type == TYPE_OPERATOR &&
sf->tokenvec[left+2].type == TYPE_NUMBER) {
pos -= 2;
left = 0;
continue;
} else if ( o(o ) {
left = 0;
pos -= 2;
continue;
} else if ( & & ) {
pos -= 2;
left = 0;
continue;
} else if ( vov vo1 von ) {
pos -= 2;
left = 0;
continue;
} else if ( no1 non 1o1 1on ) {
pos -= 2;
left = 0;
continue;
} else if ( not vot sot &&
streq(sf->tokenvec[left+1].val, "::")) {
pos -= 2;
left = 0;
sf->stats_folds += 2;
continue;
} else if ( [n 1 s v] , [1 n s v] ) {
pos -= 2;
left = 0;
continue;
} else if ( [E , B] &&
st_is_unary_op(&sf->tokenvec[left+1]) && ( ) {
/* 形如 SELECT + (, LIMIT + ( 移除一元运算符 */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if ( [k E B] &&
st_is_unary_op(&sf->tokenvec[left+1]) &&
[1 n v s f]) {
/* 形如 select - 1 移除一元运算符 */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if ( , &&
st_is_unary_op(&sf->tokenvec[left+1]) &&
[1 n v s]) {
/* 形如", -1" --> ",1"
形如"1,-1" --> "1" */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
left = 0;
assert(pos >= 3); /* pos is >= 3 so this is safe */
pos -= 3;
continue;
} else if ( , &&
st_is_unary_op(&sf->tokenvec[left+1]) &&
f ) {
/* 形如 1,-sin(1) --> 1,sin(1) 移除一元运算符 */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if ( n.n ) {
/* 形如 databasename.table 忽略.n */
assert(pos >= 3);
pos -= 2;
left = 0;
continue;
} else if ( E.n ) {
/* 形如 select . `foo` --> select `foo` */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if ( f( &&
(sf->tokenvec[left+2].type != TYPE_RIGHTPARENS)) {
/* USER() 是函数 User(foo) 不是函数 */
if (cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0) {
sf->tokenvec[left].type = TYPE_BAREWORD;
}
}
step8:
left += 1;
} /* while(1) step3.1 --> step8 */
step9:
if (left < LIBINJECTION_SQLI_MAX_TOKENS && last_comment.type == TYPE_COMMENT) {
st_copy(&sf->tokenvec[left], &last_comment);
left += 1;
}
/* 有时候获取第6个token 来确认第5个token */
if (left > LIBINJECTION_SQLI_MAX_TOKENS) {
left = LIBINJECTION_SQLI_MAX_TOKENS;
}
return (int)left;
如何计算token?
int libinjection_sqli_tokenize(struct libinjection_sqli_state * sf)
{
pt2Function fnptr;
size_t *pos = &sf->pos;
stoken_t *current = sf->current;
const char *s = sf->s;
const size_t slen = sf->slen;
if (slen == 0) {
return FALSE;
}
st_clear(current);
sf->current = current;
/*
* if we are at beginning of string
* and in single-quote or double quote mode
* then pretend the input starts with a quote
*/
if (*pos == 0 && (sf->flags & (FLAG_QUOTE_SINGLE | FLAG_QUOTE_DOUBLE))) {
*pos = parse_string_core(s, slen, 0, current, flag2delim(sf->flags), 0);
sf->stats_tokens += 1;
return TRUE;
}
while (*pos < slen) {
const unsigned char ch = (unsigned char) (s[*pos]);
fnptr = char_parse_map[ch];
*pos = (*fnptr) (sf); // 回调函数 这里很复杂
if (current->type != CHAR_NULL) {
sf->stats_tokens += 1;
return TRUE;
}
}
return FALSE;
}
下回gdb跟踪 ~