PostgreSQL 源码:psql 词法分析

1 什么是词法分析?

简单来说,词法分析就是把输入的字符串切分成一个个记号(token,例如关键字、标识符、操作符、常量等)的过程。

基础知识

/* src/backend/parser/scan.l */代码注释:

/* 定义段 */

// %x 声明了排他(exclusive)的开始条件
%x xb /* 二进制字符串;例如:b'0101' 或 B'0101' */
%x xc /* C 语言风格的注释;例如:/* comment * / */
%x xd /* 使用双引号括起来的标识符;例如:"colname" */
%x xh /* 十六进制字符串;例如:x'FE5F' 或 X'FE5F' */
%x xq /* 基本的单引号字符串;例如:'string' */
%x xe /* 扩展的单引号字符串,支持转义字符 \ ;例如:E'string \' string2' */
%x xdolq /* 采用 $ 符号括着的字符串;例如:$foo$string$foo$ */
%x xui /* 带 Unicode 转义、用双引号括起来的标识符;例如:U&"\FE5F" */
%x xuiend /* xui 的结束 */
%x xus /* 带 Unicode 转义、用单引号括起来的字符串;例如:U&'\FE5F' */
%x xusend /* xus 的结束 */
%x xeu /* xe 里面的 Unicode 转义字符串;例如:'string \uD5EF string2' */

space [ \t\n\r\f] // 空白字符
horiz_space [ \t\f]
newline [\n\r] // 换行字符
non_newline [^\n\r]
comment ("--"{non_newline}*) // 行末注释
whitespace ({space}+|{comment}) // 行末空白字符及注释

special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)

/* 单引号 */
quote ' /*【取消'】*/
quotestop {quote}{whitespace}*
quotecontinue {quote}{whitespace_with_newline}{quote}
quotefail {quote}{whitespace}*"-"

/* 二进制字符串的开始;内部不能含有 ' */
xbstart [bB]{quote}
xbinside [^']* /*【取消'】*/

/* 十六进制字符串的开始;内部不能含有 ' */
xhstart [xX]{quote}
xhinside [^']* /*【取消'】*/

/*nchar字符串的开始 */
xnstart [nN]{quote}

/* 扩展的单引号字符串,支持转义字符 \ */
xestart [eE]{quote}
xeinside [^\\']+ /*【取消'】*/
xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2}
xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})

/* 基本的单引号字符串,内部不能含有 ' */
xqstart {quote}
xqdouble {quote}{quote}
xqinside [^']+ /*【取消'】*/

/* $foo$ 样式的引用定界符(dollar quoting) */
dolq_start [A-Za-z\200-\377_]
dolq_cont [A-Za-z\200-\377_0-9]
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
dolqfailed \${dolq_start}{dolq_cont}*
dolqinside [^$]+

/* 使用双引号括起来的标识符 */
dquote \" /*【取消"】*/
xdstart {dquote}
xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+ /*【取消"】*/

digit [0-9]
ident_start [A-Za-z\200-\377_]
ident_cont [A-Za-z\200-\377_0-9$]

/* 标识符 */
identifier {ident_start}{ident_cont}*

/* 操作符 */

equals_greater "=>"
less_equals "<="
greater_equals ">="
less_greater "<>"
not_equals "!="

self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator {op_chars}+

integer {digit}+

other .

%%
/* 规则段 */
{whitespace} {
/* 忽略空白字符 */
}
/* comment 注释 */
{xcstart} {
/* comment 开始*/
SET_YYLLOC();
yyextra->xcdepth = 0;
BEGIN(xc); // 设置comment开始条件
/* Put back any characters past slash-star; see above */
yyless(2);
}

<xc>{xcstart} {
(yyextra->xcdepth)++; // comment层次加1
/* Put back any characters past slash-star; see above */
yyless(2);
}

<xc>{xcstop} {
if (yyextra->xcdepth <= 0)
BEGIN(INITIAL); // 退出comment条件
else
(yyextra->xcdepth)--; // comment层次减1
}

/* 基本的单引号字符串*/
{xqstart} {
yyextra->warn_on_first_escape = true;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
if (yyextra->standard_conforming_strings)
BEGIN(xq);
else
BEGIN(xe);
startlit();
}
/* 扩展的单引号字符串 */
{xestart} {
yyextra->warn_on_first_escape = false;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
BEGIN(xe);
startlit();
}

/* 结束符 */
<<EOF>> {
SET_YYLLOC();
yyterminate();
}

基本流程

首先基本函数

1 psqlscan_emit 函数对应 psqlscan.l 中的 ECHO 宏,负责在词法规则匹配之后,把匹配到的文本整理到 PsqlScanState->output_buf 中。
2 基本上所有规则匹配完成后都会调用 psqlscan_emit,所以调试词法分析时,可以在这个函数上设置断点。

你从客户端输入一个语句:
在这里插入图片描述
在源码 mainloop.c 文件中的入口:
psql_scan(scan_state, query_buf, &prompt_tmp);
1 psql 会一行一行地读取输入。换句话说,对于上图的语句,它首先读入字符串 "CREATE FUNCTION f1(a IN int) " 作为第一次处理,接着调用 lex 处理这一行字符串。
2 处理完毕后,才读入 "RETURN int" 作为第二次处理,重复第 1 步的过程。
这里的 LEX 指的是 .l 词法文件:后端对应 scan.l;由于这里是 psql 客户端,对应的文件是 psqlscan.l。

psqlscan.l 文件解析

说明:cur_state 是词法分析过程中的状态记录器,会一直伴随整个词法分析过程。它的类型定义在
psqlscan_int.h 中,各个字段的定义和含义可以参考注释内容:

/*
 * PsqlScanStateData - state carried by the psql lexer.
 *
 * The lexer actions reach this structure through cur_state.  Per the
 * comments on the individual groups below, part of the state describes the
 * input/output buffers and part of it persists across successive input
 * lines until reset by psql_scan_reset.
 */
typedef struct PsqlScanStateData
{
	PQExpBuffer output_buf;		/* current output buffer */

	StackElem  *buffer_stack;	/* stack of variable expansion buffers */

	/*
	 * These variables always refer to the outer buffer, never to any stacked
	 * variable-expansion buffer.
	 */
	YY_BUFFER_STATE scanbufhandle;
	char	   *scanbuf;		/* start of outer-level input buffer */
	const char *scanline;		/* current input line at outer level */

	/* safe_encoding, curline, refline are used by emit() to replace FFs */
	int			encoding;		/* encoding being used now */
	bool		safe_encoding;	/* is current encoding "safe"? */
	bool		std_strings;	/* are string literals standard? */
	const char *curline;		/* actual flex input string for cur buf */
	const char *refline;		/* original data for cur buffer */

	/*
	 * All this state lives across successive input lines, until explicitly
	 * reset by psql_scan_reset.  start_state is adopted by yylex() on entry,
	 * and updated with its finishing state on exit.
	 */
	int			postion_len;	/* record the position of the first symmetric
								 * char (field name keeps its historical
								 * spelling) */	/* ReqID:SRS-SQL-ESCAPECHAR */
	int			start_state;	/* yylex's starting/finishing state */
	int			state_before_str_stop;	/* start cond. before end quote */
	int			paren_depth;	/* depth of nesting in parentheses */
	int			xcdepth;		/* depth of nesting in slash-star comments */
	char	   *dolqstart;		/* current $foo$ quote start string */

	/*
	 * State to track boundaries of BEGIN ... END blocks in function
	 * definitions, so that semicolons do not send query too early.
	 */
	int			identifier_count;	/* identifiers since start of statement */
	char		identifiers[4]; /* records the first few identifiers */
	int			begin_depth;	/* depth of begin/end pairs */

	/* Begin - ReqID:SRS-SQL-PROC */
	bool		cancel_semicolon_terminator; /* not send command when semicolon found */

	/*
	 * State to track boundaries of Oracle ANONYMOUS BLOCK.
	 * Case 1: Statements starting with << ident >> is Oracle anonymous block.
	 */
	int			token_count;			/* # of tokens, not blank or newline since start of statement */
	bool		anonymous_label_start;	/* T if the first token is "<<" */
	bool		anonymous_label_ident;	/* T if the second token is an identifier */
	bool		anonymous_label_end;	/* T if the third token is ">>" */
	bool		xcstate;				/* T if a C-style comment is being parsed */

	/*
	 * Case 2: DECLARE BEGIN ... END is Oracle anonymous block syntax.
	 * DECLARE can also be a PostgreSQL cursor declaration statement, we need to distinguish it.
	 */
	bool		maybe_anonymous_declare_start;	/* T if the first token is DECLARE */
	int			token_cursor_idx;				/* the position of keyword CURSOR in SQL statement */

	/*
	 * Case 3: a statement whose first token is BEGIN may be an Oracle
	 * anonymous block.
	 * BEGIN can also be a PostgreSQL transaction statement.
	 */
	bool		maybe_anonymous_begin_start;	/* T if the first token is BEGIN */
	/* End - ReqID:SRS-SQL-PROC */

	/*
	 * Callback functions provided by the program making use of the lexer,
	 * plus a void* callback passthrough argument.
	 */
	const PsqlScanCallbacks *callbacks;
	const Ora_psqlScanCallbacks *oracallbacks;	/* ReqID:SRS-PSQL-PARSER */
	const Mys_psqlScanCallbacks *myscallbacks;	/* ReqID:SRS-PSQL-PARSER */
	void	   *cb_passthrough;

	/* Begin - ReqID:SRS-CMD-PSQL */
	/*
	 * literalbuf is used to accumulate literal values when multiple rules are
	 * needed to parse a single literal.  Call startlit() to reset buffer to
	 * empty, addlit() to add text.  NOTE: the string in literalbuf is NOT
	 * necessarily null-terminated, but there always IS room to add a trailing
	 * null at offset literallen.  We store a null only when we need it.
	 */
	char	   *literalbuf;		/* palloc'd expandable buffer */
	int			literallen;		/* actual current string length */
	int			literalalloc;	/* current allocated buffer size */

	bool			is_sqlplus_cmd;	/* T if is a psqlplus command */
	psqlplus_cmd	*psqlpluscmd;
	/* End - ReqID:SRS-CMD-PSQL */

} PsqlScanStateData;
一般的标识符都会进入下面的 {identifier} 规则:

 {identifier}	{
				if (cur_state->identifier_count == 0)
					memset(cur_state->identifiers, 0, sizeof(cur_state->identifiers));
					// 这是语句中的第一个标识符时,先把 identifiers 数组清零(这里只是清零,并不分配新的空间)

				if (pg_strcasecmp(yytext, "create") == 0 || //这里是对pg里一些特殊语句处理的 比如 CREATE FUNCTION这种
					pg_strcasecmp(yytext, "function") == 0 ||
					pg_strcasecmp(yytext, "procedure") == 0 ||
					pg_strcasecmp(yytext, "or") == 0 ||
					pg_strcasecmp(yytext, "replace") == 0)
				{
				// 下面只是把这几个单词的首字母(转成小写)记录到 identifiers 数组中,供后面判断语句开头使用;这里并没有查询 gram.y / kwlist 中的关键字表。(另:在 gram.y 中新增 token 时,需要在 kwlist 里同样添加一个对应的条目,那属于后端 scan.l 的关键字查找逻辑。)
					if (cur_state->identifier_count < sizeof(cur_state->identifiers))
						cur_state->identifiers[cur_state->identifier_count] = pg_tolower((unsigned char) yytext[0]);
				}

				cur_state->identifier_count++;
				// 下面根据记录的首字母判断语句是否以 CREATE [OR REPLACE] FUNCTION/PROCEDURE 开头且当前不在括号内;这是对函数定义的特殊处理(具体内容此处省略),初读时可以忽略

				if (cur_state->identifiers[0] == 'c' &&
					(cur_state->identifiers[1] == 'f' || cur_state->identifiers[1] == 'p' ||
					 (cur_state->identifiers[1] == 'o' && cur_state->identifiers[2] == 'r' &&
					  (cur_state->identifiers[3] == 'f' || cur_state->identifiers[3] == 'p'))) &&
					cur_state->paren_depth == 0)
				{
				...........}
				// 把识别到的 token 文本输出到 output_buf
				ECHO; // 至此已经完成该 token 的识别;下面会继续读取下一个 token,重复上述动作
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值