【编译原理】实验一：实现简易词法分析器

最新推荐文章于 2024-04-07 20:03:53 发布

社恐患者

最新推荐文章于 2024-04-07 20:03:53 发布

阅读量3.3k

点赞数 12

分类专栏：编译原理　文章标签：c语言 编译器

本文链接：https://blog.csdn.net/qq_44714521/article/details/106950654

版权

编译原理专栏收录该内容

11 篇文章 8 订阅

订阅专栏

C语言实现

C语言子集的单词符号及内码值

所设计的输出单词符号二元式的组成为：

(种别编码,单词自身的值)

例如(1,while)、(7,123)、(6,hello)。
宏定义助记符id、num、relop。

单词符号	种别编码	助记符	内码值
while	1	while	–
if	2	if	–
else	3	else	–
switch	4	switch	–
case	5	case	–
标识符	6	id	id在符号表中的位置
常数	7	num	num在常数表中的位置
+	8	+	–
-	9	-	–
*	10	*	–
<=	11	relop	LE
<	11	relop	LT
==	11	relop	EQ
=	12	=	–
;	13	;	–

实验环境

windows 10
visual studio 2019

测试文件test.txt

if atest=1
	hello world!;
else
	d=d*atest-1;
while btest<=3
	atest+c=1213;
switch syx
	case 1	atest<btest;
	case 2	atest==btest;

函数说明

代码主要基于《编译原理教程（第四版）》¹ 第二章词法分析（p14）实现，尽量去贴合书上所给例题。

0 变量说明

character		字符变量，存放最新读入的源程序字符
token			字符数组，存放构成单词符号的字符串

1 对输入串进行预处理

主要对输入串进行剔除多余的空白符操作。调用getbe()函数，若character中的字符为空白，则调用getchar_syx()函数（书上为getchar()），直至character为非空白符为止。

/*****************************************************************************
 * Function: getbe()
 * Type:     void
 * Params:   void
 * Purpose:  Skip blanks. While character holds a space, tab, carriage return
 *           or newline, call getchar_syx() until character is non-blank.
 *           EOF is not a blank, so the loop also stops at end of input.
 *****************************************************************************/
void getbe() {
	while (character == ' ' || character == '\t' ||
	       character == '\r' || character == '\n') {
		getchar_syx();
	}
}

其中，getchar_syx()用来逐个读取文件中的字符。

/*****************************************************************************
 * Function: getchar_syx()
 * Type:     void
 * Params:   void
 * Purpose:  Read the next character of the input file into character.
 *           When no input stream is open, character is set to EOF so that
 *           callers looping "until EOF" terminate instead of spinning.
 * NOTE(review): character should be declared int so that EOF can be told
 *           apart from a data byte - confirm against its declaration.
 *****************************************************************************/
void getchar_syx() {
	if (fin) {
		character = fgetc(fin);
	}
	else {
		character = EOF;
	}
}

2 将读入的字符链接成字符串保存在token中

调用concatenation()函数，将token中的字符串与character中的字符连接并作为token中新的字符串。

/*****************************************************************************
 * Function: concatenation()
 * Type:     void
 * Params:   void
 * Purpose:  Append character to the string held in token and keep the
 *           string NUL-terminated.
 *           A character that would not fit (leaving room for the
 *           terminator) is dropped, so an over-long lexeme is truncated
 *           instead of overflowing the buffer.
 * NOTE(review): relies on token being a char array, so that sizeof(token)
 *           is its capacity - confirm against its declaration.
 *****************************************************************************/
void concatenation() {
	if (token_point + 1 >= (int)sizeof(token))
		return;	/* no room for this character plus the terminator */

	token[token_point] = character;
	token_point++;
	token[token_point] = '\0';	/* string terminator */
}

3 判断character中的字符是否为字母或数字

letter()和digit()是两个布尔函数，分别判断character中的字符是否为字母、是否为数字，是则返回true（即1），否则返回false（即0）。

/*****************************************************************************
 * Function: letter()
 * Type:     _Bool
 * Params:   void
 * Purpose:  Report whether character is an ASCII letter ('a'-'z' or
 *           'A'-'Z'). Returns 1 if it is, 0 otherwise.
 *****************************************************************************/
_Bool letter() {
	const _Bool is_lower = (character >= 'a') && (character <= 'z');
	const _Bool is_upper = (character >= 'A') && (character <= 'Z');

	return is_lower || is_upper;
}

/*****************************************************************************
 * Function: digit()
 * Type:     _Bool
 * Params:   void
 * Purpose:  Report whether character is a decimal digit ('0'-'9').
 *           Returns 1 if it is, 0 otherwise.
 *****************************************************************************/
_Bool digit() {
	if (character < '0')
		return 0;	/* below the digit range */
	return character <= '9';
}

4 判断token中的字符串是否为保留字

调用reserve()函数，按token数组中的字符串查保留字表，若是保留字的话，返回它的编码值，否则返回0值。
这里是定义了一个保留字表，存储保留字名称，用数组下标标识其编码值。

/*****************************************************************************
 * Function: reserve()
 * Type:     int
 * Params:   void
 * Purpose:  Look token up in reserved_Word. Returns the index of the
 *           matching entry, which doubles as the reserved word's code,
 *           or 0 when token is not a reserved word.
 *           The search starts at index 1; entry 0 is never examined.
 *****************************************************************************/
int reserve() {
	int code = 1;

	while (code < reserved_Len) {
		if (!strcmp(token, reserved_Word[code]))
			return code;
		code++;
	}
	return 0;
}

5 指针回退

调用retract()函数，扫描指针回退一个字符，同时将character置为空白。

/*****************************************************************************
 * Function: retract()
 * Type:     void
 * Params:   void
 * Purpose:  Move the scan position back by one character and set
 *           character to a blank.
 *           The look-ahead character is pushed back with ungetc(). When the
 *           look-ahead was EOF nothing was consumed, so nothing is pushed
 *           back; stepping back in that case would re-read the last
 *           character of the file over and over.
 *****************************************************************************/
void retract() {
	if (fin && character != EOF)
		ungetc(character, fin);
	character = ' ';
}

6 标识符/常数登记入表

调用buildlist()函数，将标识符登记到符号表中或将常数登记到常数表中。
（1）如果是常数，需要将字符串转换成数字（利用atoi函数）。
（2）无论是标识符还是常数，在登记入表时都需要查表看看表中是否已有该项了，如果表中没有项，才添加进表。

/*****************************************************************************
 * Function: buildlist()
 * Type:     void
 * Params:   void
 * Purpose:  Register the lexeme in token. A lexeme starting with a digit is
 *           converted with atoi() and entered in number_table; anything
 *           else is entered in identifier_table. An entry that already
 *           exists is reused. In every case present_point is set to the
 *           index of the entry.
 * NOTE(review): the capacities of number_table and identifier_table are not
 *           visible here, so no table-full check is made - confirm the
 *           tables are large enough for the expected input.
 *****************************************************************************/
void buildlist() {
	int i;

	/************************ constants -> number_table **********************/
	if (token[0] >= '0' && token[0] <= '9') {
		int value = atoi(token);	/* numeric value of the lexeme */

		for (i = 0; i < number_point; i++) {
			if (number_table[i] == value) {	/* already registered */
				present_point = i;
				return;
			}
		}
		number_table[number_point] = value;
		present_point = number_point;
		number_point++;
		return;
	}

	/********************* identifiers -> identifier_table *******************/
	for (i = 0; i < id_point; i++) {
		if (strcmp(token, identifier_table[i]) == 0) {	/* already registered */
			present_point = i;
			return;
		}
	}
	/* The size passed to strcpy_s must be the destination's capacity, not
	 * the source buffer's.
	 * NOTE(review): assumes identifier_table is a 2-D char array, so that
	 * sizeof(identifier_table[id_point]) is the row size - confirm. */
	strcpy_s(identifier_table[id_point], sizeof(identifier_table[id_point]), token);
	present_point = id_point;
	id_point++;
}

7 词法分析

唔，尽量贴合伪代码去写的。只是实验一，虽然后来课设也用到了，比较菜就看看吧。

/*****************************************************************************
 *函数名称：test()
 *函数类型：void
 *参数：void
 *功能：词法分析
 *****************************************************************************/
void test() {
	//strcpy_s(token, sizeof(token), " ");	//对token数组初始化
	token_point = 0;						//token指针初始化
	getchar_syx();							//获取字符
	getbe();  								//滤除空格
	switch (character) {
	case 'a':
	case 'b':
	case 'c':
	case 'd':
	case 'e':
	case 'f':
	case 'g':
	case 'h':
	case 'i':
	case 'j':
	case 'k':
	case 'l':
	case 'm':
	case 'n':
	case 'o':
	case 'p':
	case 'q':
	case 'r':
	case 's':
	case 't':
	case 'u':
	case 'v':
	case 'w':
	case 'x':
	case 'y':
	case 'z':
		/*字母开头，为标识符/保留字*/
		while (letter() || digit()) {
			/*标识符包括字母和数字*/
			concatenation();    //将当前读入的字符送入token数组
			getchar_syx();
		}
		retract();            	//扫描指针回退一个字符
		int c = reserve();
		if (c == 0) {
			buildlist();        //将标识符登录到符号表中
			/*return(id, 指向id的符号表入口指针);*/
			fprintf(fout, "(%d,%s)\t\t指向id的符号表入口指针：\t%d\n", id, identifier_table[present_point], present_point);
		}
		else {
			/*return(保留字码，null);*/
			fprintf(fout, "(%d,%s)\n", c, reserved_Word[c]);
		}
		break;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		/*数字开头，为常数*/
		while (digit()) {
			/*常数只包括数字*/
			concatenation();
			getchar_syx();
		}
		retract();
		buildlist();          //将常数登录到常数表中
		/*return(num, num的常数表入口指针);*/
		fprintf(fout, "(%d,%d)\t\t指向num的符号表入口指针：\t%d\n", num, number_table[present_point], present_point);
		break;
	case '+':
		/*return('+', null);*/
		fprintf(fout, "(%c,_)\n", '+');
		break;
	case '-':
		/*return('-', null);*/
		fprintf(fout, "(%c,_)\n", '-');
		break;
	case '*':
		/*return('*', null);*/
		fprintf(fout, "(%c,_)\n", '*');
		break;
	case '<':
		getchar_syx();
		if (character == '=')
			fprintf(fout, "(%d,%s)\n", relop, "LE");
		/*reutrn(relop, LE);*/
		else {
			retract();
			/*return(relop, LT);*/
			fprintf(fout, "(%d,%s)\n", relop, "LT");
		}
		break;
	case '=':
		getchar_syx();
		if (character == '=')
			fprintf(fout, "(%d,%s)\n", relop, "EQ");
		/*return(relop, EQ);*/
		else {
			retract();
			fprintf(fout, "(%c,_)\n", '=');
			/*return('=', null);*/
		}
		break;
	case ';':
		fprintf(fout, "(%c,_)\n", ';');
		/*return(';', null);*/
		break;
	default:
		if (character == '\n' || character == '\t' || character == EOF)
			break;
		error();			//调用出错函数
	}
}

8 出错处理

调用error()函数，出现非法字符，显示出错信息。

/*****************************************************************************
 * Function: error()
 * Type:     void
 * Params:   void
 * Purpose:  Report an illegal character: prints a message containing the
 *           current value of character to standard output.
 *****************************************************************************/
void error() {
	fprintf(stdout, "非法字符：%c\n", character);
}

9 主函数

fopen_s的返回值是errno_t类型的错误码，其含义可参考《Linux errno详解》。
不想深入了解的话，只需知道err_in和err_out为0表示对应文件打开成功即可。

/*
 * Entry point: opens the input and output files, runs test() until
 * character becomes EOF, then closes both files.
 * Returns 0 on success and 1 when a file cannot be opened or the output
 * cannot be flushed.
 */
int main() {
	errno_t err_in, err_out;

	err_in = fopen_s(&fin, "E:\\test.txt", "r");
	if (err_in != 0 || !fin) {
		fprintf(stderr, "cannot open input file E:\\test.txt (error %d)\n", (int)err_in);
		return 1;
	}

	err_out = fopen_s(&fout, "E:\\testout.txt", "w+");
	if (err_out != 0 || !fout) {
		fprintf(stderr, "cannot open output file E:\\testout.txt (error %d)\n", (int)err_out);
		fclose(fin);
		return 1;
	}

	while (character != EOF)
		test();

	fclose(fin);
	/* fclose flushes buffered output, so a failure here means lost data */
	if (fclose(fout) != 0) {
		fprintf(stderr, "error while writing E:\\testout.txt\n");
		return 1;
	}
	return 0;
}

运行结果

保存在testout.txt中。

(2,if)
(6,atest)		指向id的符号表入口指针：	0
(=,_)
(7,1)		指向num的符号表入口指针：	0
(6,hello)		指向id的符号表入口指针：	1
(6,world)		指向id的符号表入口指针：	2
(;,_)
(3,else)
(6,d)		指向id的符号表入口指针：	3
(=,_)
(6,d)		指向id的符号表入口指针：	3
(*,_)
(6,atest)		指向id的符号表入口指针：	0
(-,_)
(7,1)		指向num的符号表入口指针：	0
(;,_)
(1,while)
(6,btest)		指向id的符号表入口指针：	4
(11,LE)
(7,3)		指向num的符号表入口指针：	1
(6,atest)		指向id的符号表入口指针：	0
(+,_)
(6,c)		指向id的符号表入口指针：	5
(=,_)
(7,1213)		指向num的符号表入口指针：	2
(;,_)
(4,switch)
(6,syx)		指向id的符号表入口指针：	6
(5,case)
(7,1)		指向num的符号表入口指针：	0
(6,atest)		指向id的符号表入口指针：	0
(11,LT)
(6,btest)		指向id的符号表入口指针：	4
(;,_)
(5,case)
(7,2)		指向num的符号表入口指针：	3
(6,atest)		指向id的符号表入口指针：	0
(11,EQ)
(6,btest)		指向id的符号表入口指针：	4
(;,_)

lex实现

%{
	#include <stdio.h>
	#include <stdlib.h>
	int count = 0;	/* running number printed in front of each token */
	/*
	 * Named patterns defined below:
	 * digit		a decimal digit
	 * letter		an upper- or lower-case ASCII letter
	 * reservedWord	one of while, if, else, switch, case
	 * identifier	a letter followed by any number of letters or digits
	 * constant		one or more digits
	 * operator		one of + - * <= < == = ;
	 * delim		a single space, tab, newline or carriage return
	 * whitespace	one or more delim characters
	 * other		any other single character (reported as illegal)
	 */
%}

digit			[0-9]
letter			[a-zA-Z]
reservedWord	[w][h][i][l][e]|[i][f]|[e][l][s][e]|[s][w][i][t][c][h]|[c][a][s][e]
identifier		{letter}({letter}|{digit})*
constant		{digit}+
operator		\+|-|\*|<=|<|==|=|;
delim			[ \t\n\r]
whitespace		{delim}+
other			.
%%
{reservedWord}  {count++;fprintf(yyout,"%d\t(1,‘%s’)\n",count,yytext); /* category 1: reserved word */}
{identifier}    {count++;fprintf(yyout,"%d\t(2,‘%s’)\n",count,yytext); /* category 2: identifier */}
{constant}		{count++;fprintf(yyout,"%d\t(3,‘%s’)\n",count,yytext); /* category 3: constant */}
{operator}      {count++;fprintf(yyout,"%d\t(4,‘%s’)\n",count,yytext); /* category 4: operator */}
{whitespace}    { /* whitespace produces no output and is not counted */ }
{other}			{fprintf(yyout,"非法字符：'%s'\n",yytext); /* illegal character: reported, not counted */}

%%
/*
 * Entry point of the generated scanner: reads test.txt, runs yylex() over
 * it and writes the token listing to testout.txt.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when a file cannot be opened or
 * the output cannot be flushed.
 */
int main(void)
{
	yyin = fopen("test.txt", "r");
	if (yyin == NULL) {
		perror("test.txt");
		return EXIT_FAILURE;
	}

	yyout = fopen("testout.txt", "w+");
	if (yyout == NULL) {
		perror("testout.txt");
		fclose(yyin);
		return EXIT_FAILURE;
	}

	yylex(); /* start the analysis */

	fclose(yyin);
	/* fclose flushes buffered output, so a failure here means lost data */
	if (fclose(yyout) != 0) {
		perror("testout.txt");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}
 /*
  * End-of-input hook of the scanner. Always returns 1; by lex/flex
  * convention a non-zero result means no further input follows.
  */
 int yywrap()
 {
 	enum { NO_MORE_INPUT = 1 };

 	return NO_MORE_INPUT;
 }

运行结果

testout.txt

1	(1,‘if’)
2	(2,‘atest’)
3	(4,‘=’)
4	(3,‘1’)
5	(2,‘hello’)
6	(2,‘world’)
非法字符：'!'
7	(4,‘;’)
8	(1,‘else’)
9	(2,‘d’)
10	(4,‘=’)
11	(2,‘d’)
12	(4,‘*’)
13	(2,‘atest’)
14	(4,‘-’)
15	(3,‘1’)
16	(4,‘;’)
17	(1,‘while’)
18	(2,‘btest’)
19	(4,‘<=’)
20	(3,‘3’)
21	(2,‘atest’)
22	(4,‘+’)
23	(2,‘c’)
24	(4,‘=’)
25	(3,‘1213’)
26	(4,‘;’)
27	(1,‘switch’)
28	(2,‘syx’)
29	(1,‘case’)
30	(3,‘1’)
31	(2,‘atest’)
32	(4,‘<’)
33	(2,‘btest’)
34	(4,‘;’)
35	(1,‘case’)
36	(3,‘2’)
37	(2,‘atest’)
38	(4,‘==’)
39	(2,‘btest’)
40	(4,‘;’)