1 什么是词法分析?
简单来说 就是一个把字符串分割成一个个 关键字或者标识符的过程。
基础知识
/* src/backend/parser/scan.l */ 代码注释:
/* 定义段 */
…
// %x 声明了排他的开始条件
%x xb		/* 二进制字符串;例如:bB'0101' */
%x xc		/* C语言风格的注释;例如: slash-star comment star-slash */
%x xd		/* 使用双引号括起来的标识符;例如:"colname" */
%x xh		/* 十六进制字符串;例如:xX'FE5F' */
%x xq		/* 基本的单引号字符串;例如:'string' */
%x xe		/* 扩展的单引号字符串,支持转义字符\;例如:E'string\'string2' */
%x xdolq	/* 采用$$符号括着的字符串;例如:$foo$ ... $foo$ */
%x xui		/* 使用Unicode括起来的标识符;例如:uU&"FE5F" */
%x xuiend	/* xui 的结束 */
%x xus		/* 使用Unicode括起来的字符串;例如:uU&'FE5F' */
%x xusend	/* xus 的结束 */
%x xeu		/* xe里面的Unicode字符串;例如:E'string \uD5EF string2' */
space [ \t\n\r\f] // 空白字符
horiz_space [ \t\f]
newline [\n\r] // 换行字符
non_newline [^\n\r]
comment ("--"{non_newline}*) // 行末注释
whitespace ({space}+|{comment}) // 行末空白字符及注释
special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}{newline}{special_whitespace})
/* 单引号 */
quote			'
quotestop		{quote}{whitespace}*
quotecontinue	{quote}{whitespace_with_newline}{quote}
quotefail		{quote}{whitespace}*"-"
/* 二进制字符串的开始、内部不能含有 ' */
xbstart			[bB]{quote}
xbinside		[^']*
/* 十六进制字符串的开始、内部不能含有 ' */
xhstart			[xX]{quote}
xhinside		[^']*
/* nchar字符串的开始 */
xnstart			[nN]{quote}
/* 扩展的单引号字符串,支持转义字符\ */
xestart			[eE]{quote}
xeinside		[^\\']+
xeescape		[\\][^0-7]
xeoctesc		[\\][0-7]{1,3}
xehexesc		[\\]x[0-9A-Fa-f]{1,2}
xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
/* 基本的单引号字符串,内部不能含有 ' */
xqstart			{quote}
xqdouble		{quote}{quote}
xqinside		[^']+
/* $foo$ 样式的字符串引用 */
dolq_start		[A-Za-z\200-\377_]
dolq_cont		[A-Za-z\200-\377_0-9]
dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
dolqfailed		\${dolq_start}{dolq_cont}*
dolqinside		[^$]+
/* 使用双引号括起来的标识符 */
dquote			\"
xdstart			{dquote}
xdstop			{dquote}
xddouble		{dquote}{dquote}
xdinside		[^"]+
digit [0-9]
ident_start [A-Za-z\200-\377_]
ident_cont [A-Za-z\200-\377_0-9$]
/* 标识符 */
identifier		{ident_start}{ident_cont}*
/* 操作符 */
equals_greater	"=>"
less_equals		"<="
greater_equals	">="
less_greater	"<>"
not_equals		"!="
self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator		{op_chars}+
integer			{digit}+
other			.
%%
/* 规则段 */
{whitespace} {
    /* 忽略空白字符 */
}
/* comment 注释 */
{xcstart} {
    /* comment 开始 */
    SET_YYLLOC();
    yyextra->xcdepth = 0;
    BEGIN(xc); // 设置comment开始条件
    /* Put back any characters past slash-star; see above */
    yyless(2);
}
<xc>{xcstart} {
    (yyextra->xcdepth)++; // comment层次加1
    /* Put back any characters past slash-star; see above */
    yyless(2);
}
<xc>{xcstop} {
    if (yyextra->xcdepth <= 0)
        BEGIN(INITIAL); // 退出comment条件
    else
        (yyextra->xcdepth)--; // comment层次减1
}
/* 基本的单引号字符串*/
{xqstart} {
yyextra->warn_on_first_escape = true;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
if (yyextra->standard_conforming_strings)
BEGIN(xq);
else
BEGIN(xe);
startlit();
}
/* 扩展的单引号字符串 */
{xestart} {
yyextra->warn_on_first_escape = false;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
BEGIN(xe);
startlit();
}
/* 结束符 (EOF) */
<<EOF>> {
    SET_YYLLOC();
    yyterminate();
}
基本流程
首先基本函数
1 psqlscan_emit函数是psqlscan.l中的ECHO宏,负责匹配词法后,把数据整理到PsqlScanState->output_buf中。
2 基本所有的语法匹配完了都会调psqlscan_emit,所以调试语法树挂这个函数。
你从客户端输入一个语句:
在源码 mainloop.c文件中入口:
psql_scan(scan_state, query_buf, &prompt_tmp);
1 会一行一行的读取,换句话说,如上图的语句 他首先就会读入字符串“CREATE FUNCTION f1(a IN int) ”作为 第一次处理。接着调用lex处理这一行字符串。
2 处理完毕 才读入“RETURN int” 作为 第二次又循环处理 第一步步骤。
LEX 就是我们的scan.l 文件 由于是psql端 所以全名是psql_scan.l 文件
psql_scan.l 文件解析
说明:cur_state 是一个词法分析时的一个状态 记录器 会一直伴随你的词法分析。定义在:
psqlscan_int.h 各个字段定义 和意义可以看注释内容:
/*
 * PsqlScanStateData - persistent state of the psql lexer.
 *
 * One instance lives across successive input lines, so that multi-line
 * constructs (quoted strings, comments, BEGIN...END bodies, anonymous
 * blocks) can be tracked until psql_scan_reset is called.
 */
typedef struct PsqlScanStateData{
PQExpBuffer output_buf; /* current output buffer */
StackElem *buffer_stack; /* stack of variable expansion buffers */
/*
 * These variables always refer to the outer buffer, never to any stacked
 * variable-expansion buffer.
 */
YY_BUFFER_STATE scanbufhandle;
char *scanbuf; /* start of outer-level input buffer */
const char *scanline; /* current input line at outer level */
/* safe_encoding, curline, refline are used by emit() to replace FFs */
int encoding; /* encoding being used now */
bool safe_encoding; /* is current encoding "safe"? */
bool std_strings; /* are string literals standard? */
const char *curline; /* actual flex input string for cur buf */
const char *refline; /* original data for cur buffer */
/*
 * All this state lives across successive input lines, until explicitly
 * reset by psql_scan_reset. start_state is adopted by yylex() on entry,
 * and updated with its finishing state on exit.
 */
int postion_len; /* record the position of the first symmetric char */ /* ReqID:SRS-SQL-ESCAPECHAR */
int start_state; /* yylex's starting/finishing state */
int state_before_str_stop; /* start cond. before end quote */
int paren_depth; /* depth of nesting in parentheses */
int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */
/*
 * State to track boundaries of BEGIN ... END blocks in function
 * definitions, so that semicolons do not send query too early.
 */
int identifier_count; /* identifiers since start of statement */
char identifiers[4]; /* records the first few identifiers */
int begin_depth; /* depth of begin/end pairs */
/* Begin - ReqID:SRS-SQL-PROC */
bool cancel_semicolon_terminator; /* not send command when semicolon found */
/*
 * State to track boundaries of Oracle ANONYMOUS BLOCK.
 * Case 1: Statements starting with << ident >> is Oracle anonymous block.
 */
int token_count; /* # of tokens, not blank or newline since start of statement */
bool anonymous_label_start; /* T if the first token is "<<" */
bool anonymous_label_ident; /* T if the second token is an identifier */
bool anonymous_label_end; /* T if the third token is ">>" */
bool xcstate; /* T If a C-style comment is being parsed */
/*
 * Case 2: DECLARE BEGIN ... END is Oracle anonymous block syntax.
 * DECLARE can also be a PostgreSQL cursor declaration statement, we need to distinguish it.
 */
bool maybe_anonymous_declare_start; /* T if the first token is DECLARE */
int token_cursor_idx; /* the position of keyword CURSOR in SQL statement */
/*
 * Case 3: DECLARE BEGIN ... END is Oracle anonymous block syntax.
 * BEGIN can also be a PostgreSQL transaction statement.
 */
bool maybe_anonymous_begin_start; /* T if the first token is BEGIN */
/* End - ReqID:SRS-SQL-PROC */
/*
 * Callback functions provided by the program making use of the lexer,
 * plus a void* callback passthrough argument.
 */
const PsqlScanCallbacks *callbacks;
const Ora_psqlScanCallbacks *oracallbacks; /* ReqID:SRS-PSQL-PARSER */
const Mys_psqlScanCallbacks *myscallbacks; /* ReqID:SRS-PSQL-PARSER */
void *cb_passthrough;
/* Begin - ReqID:SRS-CMD-PSQL */
/*
 * literalbuf is used to accumulate literal values when multiple rules are
 * needed to parse a single literal. Call startlit() to reset buffer to
 * empty, addlit() to add text. NOTE: the string in literalbuf is NOT
 * necessarily null-terminated, but there always IS room to add a trailing
 * null at offset literallen. We store a null only when we need it.
 */
char *literalbuf; /* palloc'd expandable buffer */
int literallen; /* actual current string length */
int literalalloc; /* current allocated buffer size */
bool is_sqlplus_cmd; /* T if is a psqlplus command */
psqlplus_cmd *psqlpluscmd;
/* End - ReqID:SRS-CMD-PSQL */
} PsqlScanStateData;
一般的字段都会进入
{identifier} {
if (cur_state->identifier_count == 0)
memset(cur_state->identifiers, 0, sizeof(cur_state->identifiers));
//这里是开辟一个标识符存储空间
if (pg_strcasecmp(yytext, "create") == 0 || //这里是对pg里一些特殊语句处理的 比如 CREATE FUNCTION这种
pg_strcasecmp(yytext, "function") == 0 ||
pg_strcasecmp(yytext, "procedure") == 0 ||
pg_strcasecmp(yytext, "or") == 0 ||
pg_strcasecmp(yytext, "replace") == 0)
{
//下面是看是否是我们在gram.y里定义的 TOKEN (关键字)。像 我们所熟悉的 CREATE TABLE 这两个都是关键字。所以我们在gram中加token的时候 需要在kwlist里 同样添加一个token字段,这里访问的便是kwlist里的内容
if (cur_state->identifier_count < sizeof(cur_state->identifiers))
cur_state->identifiers[cur_state->identifier_count] = pg_tolower((unsigned char) yytext[0]);
}
cur_state->identifier_count++;
//下面也是对函数的一些处理 忽略即可 一般到上面的 关键字识别就已经完成了
if (cur_state->identifiers[0] == 'c' &&
(cur_state->identifiers[1] == 'f' || cur_state->identifiers[1] == 'p' ||
(cur_state->identifiers[1] == 'o' && cur_state->identifiers[2] == 'r' &&
(cur_state->identifiers[3] == 'f' || cur_state->identifiers[3] == 'p'))) &&
cur_state->paren_depth == 0)
{
...........}
// 这个是把识别到的关键字返回给 output_buf
ECHO;
/* 至此已经完成该token的识别,下面会继续读取下一个token,重复上述动作 */