Postgres数据库之词法分析阶段关键字相关处理学习汇总

最新推荐文章于 2024-03-29 10:10:51 发布

丶Summer ~Z

最新推荐文章于 2024-03-29 10:10:51 发布

阅读量214

点赞数

文章标签：数据库学习

本文链接：https://blog.csdn.net/qq_40310161/article/details/129954564

版权

Postgres数据库之词法分析阶段关键字相关处理学习汇总

关键字文件、接口、数据结构

关键字文件、接口、数据结构

当后端接收到来自客户端的SQL命令之后，会调用词法分析器进行处理。本文主要介绍词法分析阶段对关键字的处理流程。
关键字相关的文件如下：
定义关键字kwlist.h：

/*-------------------------------------------------------------------------
 *
 * kwlist.h
 *
 * The keyword lists are kept in their own source files for use by
 * automatic tools.  The exact representation of a keyword is determined
 * by the PG_KEYWORD macro, which is not defined in this file; it can
 * be defined by the caller for special purposes.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/include/parser/kwlist.h
 *
 *-------------------------------------------------------------------------
 */

/* there is deliberately not an #ifndef KWLIST_H here */

/*
 * List of keyword (name, token-value, category) entries.
 *
 * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
 */

/* name, value, category */
PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("absolute", ABSOLUTE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("access", ACCESS, UNRESERVED_KEYWORD)
PG_KEYWORD("action", ACTION, UNRESERVED_KEYWORD)
PG_KEYWORD("add", ADD_P, UNRESERVED_KEYWORD)
PG_KEYWORD("admin", ADMIN, UNRESERVED_KEYWORD)
PG_KEYWORD("after", AFTER, UNRESERVED_KEYWORD)
PG_KEYWORD("aggregate", AGGREGATE, UNRESERVED_KEYWORD)
PG_KEYWORD("all", ALL, RESERVED_KEYWORD)
PG_KEYWORD("also", ALSO, UNRESERVED_KEYWORD)
PG_KEYWORD("alter", ALTER, UNRESERVED_KEYWORD)
PG_KEYWORD("always", ALWAYS, UNRESERVED_KEYWORD)
PG_KEYWORD("analyse", ANALYSE, RESERVED_KEYWORD)		/* British spelling */
PG_KEYWORD("analyze", ANALYZE, RESERVED_KEYWORD)
PG_KEYWORD("and", AND, RESERVED_KEYWORD)
PG_KEYWORD("any", ANY, RESERVED_KEYWORD)
PG_KEYWORD("array", ARRAY, RESERVED_KEYWORD)
PG_KEYWORD("as", AS, RESERVED_KEYWORD)
PG_KEYWORD("asc", ASC, RESERVED_KEYWORD)
...

查找关键字接口定义文件kwlookup.h

/*-------------------------------------------------------------------------
 *
 * kwlookup.h
 *	  Key word lookup for PostgreSQL
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/kwlookup.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef KWLOOKUP_H
#define KWLOOKUP_H

/*
 * Hash function used by ScanKeywordLookup.
 *
 * ScanKeywordLookup calls it with the candidate word and its strlen()
 * length, and uses the result as a keyword number; any result outside
 * 0..num_keywords-1 is treated as "no match".
 */
typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen);

/*
 * This struct contains the data needed by ScanKeywordLookup to perform a
 * search within a set of keywords.  The contents are typically generated by
 * src/tools/gen_keywordlist.pl from a header containing PG_KEYWORD macros.
 *
 * gen_keywordlist.pl emits a positional initializer for this struct, so the
 * order of the fields below must stay in sync with that script.
 */
typedef struct ScanKeywordList
{
	const char *kw_string;		/* all keywords in order, separated by \0 */
	const uint16 *kw_offsets;	/* offset within kw_string of each keyword,
								 * indexed by keyword number */
	ScanKeywordHashFunc hash;	/* perfect hash function for keywords */
	int			num_keywords;	/* number of keywords */
	int			max_kw_len;		/* length of longest keyword; longer inputs
								 * are rejected without hashing */
} ScanKeywordList;


/*
 * Look up "str" in "keywords", ignoring ASCII case.  Returns the keyword
 * number (0..N-1), or -1 if "str" is not a keyword.  The parameter is named
 * "str" to match the definition in src/common/kwlookup.c.
 */
extern int	ScanKeywordLookup(const char *str, const ScanKeywordList *keywords);

/*
 * Return a pointer to the NUL-terminated text of keyword number n.
 *
 * This is the supported way to fetch a keyword's text; callers should not
 * index kw_string/kw_offsets themselves.  n is not range-checked here.
 */
static inline const char *
GetScanKeyword(int n, const ScanKeywordList *keywords)
{
	const char *all_keywords = keywords->kw_string;

	return &all_keywords[keywords->kw_offsets[n]];
}

#endif							/* KWLOOKUP_H */

查找关键字接口的实现函数ScanKeywordLookup

/*-------------------------------------------------------------------------
 *
 * kwlookup.c
 *	  Key word lookup for PostgreSQL
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/common/kwlookup.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#include "common/kwlookup.h"


/*
 * ScanKeywordLookup - decide whether a word is one of a set of keywords
 *
 * "keywords" describes the keyword set: the keyword strings, their offsets,
 * and a perfect hash function over them (see ScanKeywordList).
 *
 * The result is the keyword's number (0..N-1) when "str" is a keyword, and
 * -1 otherwise.  What the caller does with that number (usually indexing
 * some parallel array) is not this function's business.
 *
 * Matching ignores case, but only in the ASCII sense: 'A'-'Z' are mapped to
 * 'a'-'z' and nothing else is touched, regardless of what tolower() would do
 * in the current locale.  SQL99 requires keywords to be matched this way,
 * even though ordinary identifiers get a different case normalization.
 */
int
ScanKeywordLookup(const char *str,
				  const ScanKeywordList *keywords)
{
	size_t		len = strlen(str);
	const char *cp;
	const char *kw;
	int			h;

	/*
	 * A word longer than the longest keyword cannot match, so bail out
	 * before spending any effort on hashing or downcasing it.
	 */
	if (len > keywords->max_kw_len)
		return -1;

	/*
	 * The hash function is assumed to be case-insensitive.  Because it is a
	 * perfect hash, the only keyword the word could possibly equal is the
	 * one the hash value designates.
	 */
	h = keywords->hash(str, len);

	/* A hash value that is not a valid keyword number means no match */
	if (h < 0 || h >= keywords->num_keywords)
		return -1;

	/*
	 * Walk the input and the candidate keyword in parallel, folding only
	 * ASCII upper case in the input.  tolower() is deliberately avoided: in
	 * some locales (eg, Turkish) it gives the wrong answer.
	 */
	kw = GetScanKeyword(h, keywords);
	for (cp = str; *cp != '\0'; cp++, kw++)
	{
		char		ch = *cp;

		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';
		if (ch != *kw)
			return -1;
	}

	/* Input exhausted: it is a match only if the keyword ended here too */
	return (*kw == '\0') ? h : -1;
}

编译关键字脚本gen_keywordlist.pl

#----------------------------------------------------------------------
#
# gen_keywordlist.pl
#	Perl script that transforms a list of keywords into a ScanKeywordList
#	data structure that can be passed to ScanKeywordLookup().
#
# The input is a C header file containing a series of macro calls
#	PG_KEYWORD("keyword", ...)
# Lines not starting with PG_KEYWORD are ignored.  The keywords are
# implicitly numbered 0..N-1 in order of appearance in the header file.
# Currently, the keywords are required to appear in ASCII order.
#
# The output is a C header file that defines a "const ScanKeywordList"
# variable named according to the -v switch ("ScanKeywords" by default).
# The variable is marked "static" unless the -e switch is given.
#
# ScanKeywordList uses hash-based lookup, so this script also selects
# a minimal perfect hash function for the keyword set, and emits a
# static hash function that is referenced in the ScanKeywordList struct.
# The hash function is case-insensitive unless --no-case-fold is specified.
# Note that case folding works correctly only for all-ASCII keywords!
#
#
# Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/tools/gen_keywordlist.pl
#
#----------------------------------------------------------------------

use strict;
use warnings;
use Getopt::Long;

use FindBin;
use lib $FindBin::RealBin;

use PerfectHash;

# Command-line settings, initialized to their defaults:
#   $output_path - directory prefix for the generated file ('' = none)
#   $extern      - if true, the ScanKeywordList variable is not "static"
#   $case_fold   - if true, generate a case-insensitive hash function
#   $varname     - name of the generated ScanKeywordList variable
my $output_path = '';
my $extern      = 0;
my $case_fold   = 1;
my $varname     = 'ScanKeywords';

# 'case-fold!' is a negatable switch, so --no-case-fold clears $case_fold.
# If option parsing fails, print the usage message and exit via usage().
GetOptions(
	'output:s'   => \$output_path,
	'extern'     => \$extern,
	'case-fold!' => \$case_fold,
	'varname:s'  => \$varname) || usage();

# The first remaining argument is the keyword header to read.
my $kw_input_file = shift @ARGV || die "No input file.\n";

# Make sure output_path ends in a slash if needed.
if ($output_path ne '' && substr($output_path, -1) ne '/')
{
	$output_path .= '/';
}

# Derive the output file name from the input's base name by appending "_d",
# e.g. kwlist.h -> kwlist_d.h.  Any directory part of the input is dropped;
# the output goes under $output_path.
$kw_input_file =~ /(\w+)\.h$/
  || die "Input file must be named something.h.\n";
my $base_filename = $1 . '_d';
my $kw_def_file   = $output_path . $base_filename . '.h';

# $kif is the input header (read), $kwdef the generated header (overwritten).
open(my $kif,   '<', $kw_input_file) || die "$kw_input_file: $!\n";
open(my $kwdef, '>', $kw_def_file)   || die "$kw_def_file: $!\n";

# Opening boilerplate for keyword definition header.
# The three printf arguments fill, in order: the file name in the banner
# comment, and the include-guard macro name in the #ifndef and #define lines.
printf $kwdef <<EOM, $base_filename, uc $base_filename, uc $base_filename;
/*-------------------------------------------------------------------------
 *
 * %s.h
 *    List of keywords represented as a ScanKeywordList.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES
 *  ******************************
 *  *** DO NOT EDIT THIS FILE! ***
 *  ******************************
 *
 *  It has been GENERATED by src/tools/gen_keywordlist.pl
 *
 *-------------------------------------------------------------------------
 */

#ifndef %s_H
#define %s_H

#include "common/kwlookup.h"

EOM

# Collect the keyword names from the PG_KEYWORD("name", ...) lines of the
# input, in order of appearance.  All other lines are ignored.
my @keywords;
while (my $line = <$kif>)
{
	push @keywords, $1 if $line =~ /^PG_KEYWORD\("(\w+)"/;
}

# A case-insensitive lookup compares downcased input against the stored
# keywords, so every stored keyword must itself be lower-case.
if ($case_fold)
{
	for my $kw (@keywords)
	{
		next if $kw eq lc $kw;
		die qq|The keyword "$kw" is not lower-case in $kw_input_file\n|;
	}
}

# Require strictly ascending ASCII order.
#
# Hash-based lookup does not depend on the order, but the check rejects
# duplicate keywords cheaply, and it guarantees a canonical order for any
# code that walks the keyword table linearly.
foreach my $next (1 .. $#keywords)
{
	next if ($keywords[ $next - 1 ] cmp $keywords[$next]) < 0;
	die qq|The keyword "$keywords[$next]" is out of order in $kw_input_file\n|;
}

# Emit the string containing all the keywords.  Each keyword is written as
# its own string literal ending in an explicit \0; adjacent literals are
# concatenated by the C compiler, and the last keyword is terminated by the
# literal's implicit trailing NUL.

printf $kwdef qq|static const char %s_kw_string[] =\n\t"|, $varname;
print $kwdef join qq|\\0"\n\t"|, @keywords;
print $kwdef qq|";\n\n|;

# Emit an array of numerical offsets which will be used to index into the
# keyword string.  Also determine max keyword length.

printf $kwdef "static const uint16 %s_kw_offsets[] = {\n", $varname;

my $offset  = 0;
my $max_len = 0;
foreach my $name (@keywords)
{
	my $this_length = length($name);

	# The offsets array is declared uint16.  An offset above 65535 would be
	# silently truncated in the generated C and break lookups, so fail here.
	die
	  qq|The keyword "$name" starts at offset $offset, which does not fit in uint16\n|
	  if $offset > 0xFFFF;

	print $kwdef "\t$offset,\n";

	# Calculate the cumulative offset of the next keyword,
	# taking into account the null terminator.
	$offset += $this_length + 1;

	# Update max keyword length.
	$max_len = $this_length if $max_len < $this_length;
}

print $kwdef "};\n\n";

# Emit a macro defining the number of keywords.
# (In some places it's useful to have access to that as a constant.)
# The macro name is the upper-cased variable name plus _NUM_KEYWORDS.

printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords;

# Emit the definition of the hash function.
# The function is named <varname>_hash_func and is always emitted "static",
# whether or not --extern was given; its C text comes from PerfectHash.

my $funcname = $varname . "_hash_func";

my $f = PerfectHash::generate_hash_function(\@keywords, $funcname,
	case_fold => $case_fold);

printf $kwdef qq|static %s\n|, $f;

# Emit the struct that wraps all this lookup info into one variable.
# The initializer is positional: keyword string, offsets array, hash
# function, number of keywords, maximum keyword length.  The variable is
# "static" unless --extern was given.

printf $kwdef "static " if !$extern;
printf $kwdef "const ScanKeywordList %s = {\n", $varname;
printf $kwdef qq|\t%s_kw_string,\n|,            $varname;
printf $kwdef qq|\t%s_kw_offsets,\n|,           $varname;
printf $kwdef qq|\t%s,\n|,                      $funcname;
printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|,         uc $varname;
printf $kwdef qq|\t%d\n|,                       $max_len;
printf $kwdef "};\n\n";

# Close the include guard opened by the boilerplate at the top of the file.
printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;


# Report the command-line syntax and terminate the script via die().
# Called when GetOptions() rejects the command line.
sub usage
{
	die <<EOM;
Usage: gen_keywordlist.pl [--output/-o <path>] [--varname/-v <varname>] [--extern/-e] [--[no-]case-fold] input_file
    --output        Output directory (default '.')
    --varname       Name for ScanKeywordList variable (default 'ScanKeywords')
    --extern        Allow the ScanKeywordList variable to be globally visible
    --no-case-fold  Keyword matching is to be case-sensitive

gen_keywordlist.pl transforms a list of keywords into a ScanKeywordList.
The output filename is derived from the input file by inserting _d,
for example kwlist_d.h is produced from kwlist.h.
EOM
}

词法分析阶段会对输入的字符串调用ScanKeywordLookup接口，判断其是否为关键字，流程如下：首先，若字符串长度超过最长关键字的长度（max_kw_len），直接返回-1；否则调用关键字列表对应的hash函数（ScanKeywords_hash_func）计算该字符串的hash值（该hash函数在计算时不区分大小写），若hash值不在[0, num_keywords)范围内，同样返回-1；然后调用GetScanKeyword接口，根据hash值取出对应的关键字（小写）；最后将给定字符串逐字符转成小写（仅转换ASCII的'A'-'Z'）后与该关键字比较，如果相等，则返回该hash值（即关键字编号），否则返回-1。由于关键字表中保存的都是小写关键字，而输入在计算hash值和比较时都会按小写处理，因此整个关键字查找过程不区分大小写。
在这里插入图片描述

/home/postgres/postgres/src/common/kwlist_d.h
该文件由gen_keywordlist.pl脚本读取src/include/parser/kwlist.h文件生成，对应的Makefile规则如下：

#-------------------------------------------------------------------------
#
# Makefile
#    Makefile for src/common
#
# These files are used by the Postgres backend, and also by frontend
# programs.  These files provide common functionality that isn't directly
# concerned with portability and thus doesn't belong in src/port.
#
# This makefile generates three outputs:
#
#	libpgcommon.a - contains object files with FRONTEND defined,
#		for use by client applications
#
#	libpgcommon_shlib.a - contains object files with FRONTEND defined,
#		built suitably for use in shared libraries; for use
#		by frontend libraries
#
#	libpgcommon_srv.a - contains object files without FRONTEND defined,
#		for use only by the backend
#
# IDENTIFICATION
#    src/common/Makefile
#
#-------------------------------------------------------------------------

subdir = src/common
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
...
# where to find gen_keywordlist.pl and subsidiary files
TOOLSDIR = $(top_srcdir)/src/tools
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm

all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a

distprep: kwlist_d.h
...
# generate SQL keyword lookup table to be included into keywords*.o.
# --extern makes the generated ScanKeywordList variable non-static, and $<
# is the first prerequisite, i.e. kwlist.h.  No --output is passed, so the
# script writes kwlist_d.h relative to the current directory.  The rule also
# depends on the generator script and PerfectHash.pm, so changing either one
# regenerates the header.
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS)
	$(GEN_KEYWORDLIST) --extern $<

# Dependencies of keywords*.o need to be managed explicitly to make sure
# that you don't get broken parsing code, even in a non-enable-depend build.
keywords.o keywords_shlib.o keywords_srv.o: kwlist_d.h
# The code imported from Ryu gets a pass on declaration-after-statement,
# in order to keep it more closely aligned with its upstream.
RYU_FILES = d2s.o f2s.o
RYU_OBJS = $(RYU_FILES) $(RYU_FILES:%.o=%_shlib.o) $(RYU_FILES:%.o=%_srv.o)

$(RYU_OBJS): CFLAGS += $(PERMIT_DECLARATION_AFTER_STATEMENT)

# kwlist_d.h is in the distribution tarball, so it is not cleaned here.
clean distclean:
	rm -f libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a
	rm -f $(OBJS_FRONTEND) $(OBJS_SHLIB) $(OBJS_SRV)

maintainer-clean: distclean
	rm -f kwlist_d.h

kwlist_d.h文件主要包含两个数组、用于计算关键字hash值的函数（ScanKeywords_hash_func），以及把它们组合在一起的ScanKeywordList类型变量ScanKeywords
数组ScanKeywords_kw_string：依次存放kwlist.h中定义的所有关键字字符串，关键字之间以\0分隔
数组ScanKeywords_kw_offsets：以关键字的hash值（即关键字编号）为下标，存放该关键字在ScanKeywords_kw_string数组中的起始偏移量