1 什么是词法分析?
简单来说 就是一个把字符串分割成一个个 关键字或者标识符的过程。
基础知识
/* src/backend/parser/scan.l */ 代码注释:
/* 定义段 */
…
// %x 声明了排他的开始条件
%x xb		/* 二进制字符串;例如:bB'0101' */
%x xc		/* C语言风格的注释;例如: slash-star comment star-slash */
%x xd		/* 使用双引号括起来的标识符;例如:"colname" */
%x xh		/* 十六进制字符串;例如:xX'FE5F' */
%x xq		/* 基本的单引号字符串;例如:'string' */
%x xe		/* 扩展的单引号字符串,支持转义字符\;例如:E'string\'string2' */
%x xdolq	/* 采用$$符号括着的字符串;例如:$foo$ ... $foo$ */
%x xui		/* 使用Unicode括起来的标识符;例如:uU&"FE5F" */
%x xuiend	/* xui 的结束 */
%x xus		/* 使用Unicode括起来的字符串;例如:uU&'FE5F' */
%x xusend	/* xus 的结束 */
%x xeu		/* xe里面的Unicode字符串;例如:E'string \uD5EF string2' */
space [ \t\n\r\f] // 空白字符
horiz_space [ \t\f]
newline [\n\r] // 换行字符
non_newline [^\n\r]
comment ("--"{non_newline}*) // 行末注释
whitespace ({space}+|{comment}) // 行末空白字符及注释
special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}{newline}{special_whitespace})
/* 单引号 */
quote			'
quotestop		{quote}{whitespace}*
quotecontinue	{quote}{whitespace_with_newline}{quote}
quotefail		{quote}{whitespace}*"-"
/* 二进制字符串的开始、内部不能含有 ' */
xbstart			[bB]{quote}
xbinside		[^']*
/* 十六进制字符串的开始、内部不能含有 ' */
xhstart			[xX]{quote}
xhinside		[^']*
/* nchar字符串的开始 */
xnstart			[nN]{quote}
/* 扩展的单引号字符串,支持转义字符\ */
xestart			[eE]{quote}
xeinside		[^\\']+
xeescape		[\\][^0-7]
xeoctesc		[\\][0-7]{1,3}
xehexesc		[\\]x[0-9A-Fa-f]{1,2}
xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
/* 基本的单引号字符串,内部不能含有 ' */
xqstart			{quote}
xqdouble		{quote}{quote}
xqinside		[^']+
/* $foo$ 样式的字符串引用 */
dolq_start		[A-Za-z\200-\377_]
dolq_cont		[A-Za-z\200-\377_0-9]
dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
dolqfailed		\${dolq_start}{dolq_cont}*
dolqinside		[^$]+
/* 使用双引号括起来的标识符 */
dquote			\"
xdstart			{dquote}
xdstop			{dquote}
xddouble		{dquote}{dquote}
xdinside		[^"]+
digit [0-9]
ident_start [A-Za-z\200-\377_]
ident_cont [A-Za-z\200-\377_0-9$]
/* 标识符 */
identifier		{ident_start}{ident_cont}*
/* 操作符 */
equals_greater	"=>"
less_equals		"<="
greater_equals	">="
less_greater	"<>"
not_equals		"!="
self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator		{op_chars}+
integer			{digit}+
other			.
%%
/* 规则段 */
{whitespace} {
    /* 忽略空白字符 */
}
/* comment 注释 */
{xcstart} {
    /* comment 开始 */
    SET_YYLLOC();
    yyextra->xcdepth = 0;
    BEGIN(xc); // 设置comment开始条件
    /* Put back any characters past slash-star; see above */
    yyless(2);
}
<xc>{xcstart} {
    (yyextra->xcdepth)++; // comment层次加1
    /* Put back any characters past slash-star; see above */
    yyless(2);
}
<xc>{xcstop} {
    if (yyextra->xcdepth <= 0)
        BEGIN(INITIAL); // 退出comment条件
    else
        (yyextra->xcdepth)--; // comment层次减1
}
/* 基本的单引号字符串*/
{xqstart} {
yyextra->warn_on_first_escape = true;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
if (yyextra->standard_conforming_strings)
BEGIN(xq);
else
BEGIN(xe);
startlit();
}
/* 扩展的单引号字符串 */
{xestart} {
yyextra->warn_on_first_escape = false;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
BEGIN(xe);
startlit();
}
/* 结束符 (EOF) */
<<EOF>> {
    SET_YYLLOC();
    yyterminate();
}
基本流程
首先基本函数
1 psqlscan_emit函数是psqlscan.l中的ECHO宏,负责匹配词法后,把数据整理到PsqlScanState->output_buf中。
2 基本所有的语法匹配完了都会调psqlscan_emit,所以调试语法树挂这个函数。
你从客户端输入一个语句:
在源码 mainloop.c文件中入口:
psql_scan(scan_state, query_buf, &prompt_tmp);
1 会一行一行的读取,换句话说,如上图的语句 他首先就会读入字符串“CREATE FUNCTION f1(a IN int) ”作为 第一次处理。接着调用lex处理这一行字符串。
2 处理完毕 才读入“RETURN int” 作为 第二次又循环处理 第一步步骤。
LEX 就是我们的scan.l 文件 由于是psql端 所以全名是psql_scan.l 文件
psql_scan.l 文件解析
说明:cur_state 是一个词法分析时的一个状态 记录器 会一直伴随你的词法分析。定义在:
psqlscan_int.h 各个字段定义 和意义可以看注释内容:
/*
 * PsqlScanStateData - persistent state of the psql lexer.
 *
 * One instance lives across successive input lines, so that multi-line
 * constructs (quoted strings, comments, BEGIN...END bodies, anonymous
 * blocks) can be tracked until psql_scan_reset is called.
 */
typedef struct PsqlScanStateData{
PQExpBuffer output_buf; /* current output buffer */
StackElem *buffer_stack; /* stack of variable expansion buffers */
/*
 * These variables always refer to the outer buffer, never to any stacked
 * variable-expansion buffer.
 */
YY_BUFFER_STATE scanbufhandle;
char *scanbuf; /* start of outer-level input buffer */
const char *scanline; /* current input line at outer level */
/* safe_encoding, curline, refline are used by emit() to replace FFs */
int encoding; /* encoding being used now */
bool safe_encoding; /* is current encoding "safe"? */
bool std_strings; /* are string literals standard? */
const char *curline; /* actual flex input string for cur buf */
const char *refline; /* original data for cur buffer */
/*
 * All this state lives across successive input lines, until explicitly
 * reset by psql_scan_reset. start_state is adopted by yylex() on entry,
 * and updated with its finishing state on exit.
 */
int postion_len; /* record the position of the first symmetric char */ /* ReqID:SRS-SQL-ESCAPECHAR */
int start_state; /* yylex's starting/finishing state */
int state_before_str_stop; /* start cond. before end quote */
int paren_depth; /* depth of nesting in parentheses */
int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */
/*
 * State to track boundaries of BEGIN ... END blocks in function
 * definitions, so that semicolons do not send query too early.
 */
int identifier_count; /* identifiers since start of statement */
char identifiers[4]; /* records the first few identifiers */
int begin_depth; /* depth of begin/end pairs */
/* Begin - ReqID:SRS-SQL-PROC */
bool cancel_semicolon_terminator; /* not send command when semicolon found */
/*
 * State to track boundaries of Oracle ANONYMOUS BLOCK.
 * Case 1: Statements starting with << ident >> is Oracle anonymous block.
 */
int token_count; /* # of tokens, not blank or newline since start of statement */
bool anonymous_label_start; /* T if the first token is "<<" */
bool anonymous_label_ident; /* T if the second token is an identifier */
bool anonymous_label_end; /* T if the third token is ">>" */
bool xcstate; /* T If a C-style comment is being parsed */
/*
 * Case 2: DECLARE BEGIN ... END is Oracle anonymous block syntax.
 * DECLARE can also be a PostgreSQL cursor declaration statement, we need to distinguish it.
 */
bool maybe_anonymous_declare_start; /* T if the first token is DECLARE */
int token_cursor_idx; /* the position of keyword CURSOR in SQL statement */
/*
 * Case 3: DECLARE BEGIN ... END is Oracle anonymous block syntax.
 * BEGIN can also be a PostgreSQL transaction statement.
 */
bool maybe_anonymous_begin_start; /* T if the first token is BEGIN */
/* End - ReqID:SRS-SQL-PROC */
/*
 * Callback functions provided by the program making use of the lexer,
 * plus a void* callback passthrough argument.
 */
const PsqlScanCallbacks *callbacks;
const Ora_psqlScanCallbacks *oracallbacks; /* ReqID:SRS-PSQL-PARSER */
const Mys_psqlScanCallbacks *myscallbacks; /* ReqID:SRS-PSQL-PARSER */
void *cb_passthrough;
/* Begin - ReqID:SRS-CMD-PSQL */
/*
 * literalbuf is used to accumulate literal values when multiple rules are
 * needed to parse a single literal. Call startlit() to reset buffer to
 * empty, addlit() to add text. NOTE: the string in literalbuf is NOT
 * necessarily null-terminated, but there always IS room to add a trailing
 * null at offset literallen. We store a null only when we need it.
 */
char *literalbuf; /* palloc'd expandable buffer */
int literallen; /* actual current string length */
int literalalloc; /* current allocated buffer size */
bool is_sqlplus_cmd; /* T if is a psqlplus command */
psqlplus_cmd *psqlpluscmd;
/* End - ReqID:SRS-CMD-PSQL */
} PsqlScanStateData;
一般的字段都会进入
{identifier} {
if (cur_state->identifier_count == 0)
memset(cur_state->identifiers, 0, sizeof(cur_state->identifiers));
//这里是开辟一个标识符存储空间
if (pg_strcasecmp(yytext, "create") == 0 || //这里是对pg里一些特殊语句处理的 比如 CREATE FUNCTION这种
pg_strcasecmp(yytext, "function") == 0 ||
pg_strcasecmp(yytext, "procedure") == 0 ||
pg_strcasecmp(yytext, "or") == 0 ||
pg_strcasecmp(yytext, "replace") == 0)
{
//下面是看是否是我们在gram.y里定义的 TOKEN (关键字)。像 我们所熟悉的 CREATE TABLE 这两个都是关键字。所以我们在gram中加token的时候 需要在kwlist里 同样添加一个token字段,这里访问的便是kwlist里的内容
if (cur_state->identifier_count < sizeof(cur_state->identifiers))
cur_state->identifiers[cur_state->identifier_count] = pg_tolower((unsigned char) yytext[0]);
}
cur_state->identifier_count++;
//下面也是对函数的一些处理 忽略即可 一般到上面的 关键字识别就已经完成了
if (cur_state->identifiers[0] == 'c' &&
(cur_state->identifiers[1] == 'f' || cur_state->identifiers[1] == 'p' ||
(cur_state->identifiers[1] == 'o' && cur_state->identifiers[2] == 'r' &&
(cur_state->identifiers[3] == 'f' || cur_state->identifiers[3] == 'p'))) &&
cur_state->paren_depth == 0)
{
...........}
// 这个是把识别到的关键字返回给 output_buf
ECHO;
/* 至此已经完成该token的识别,下面会继续读取下一个token,重复上述动作 */