编译原理之词法分析（C语言描述）

最新推荐文章于 2023-01-05 18:18:04 发布

誓约胜利之舰

最新推荐文章于 2023-01-05 18:18:04 发布

阅读量1.7k

点赞数 2

文章标签：词法分析、编译原理、编译器

本文链接：https://blog.csdn.net/qq_40818798/article/details/89219371

版权

实验环境

编写语言：C
测试语言：C
开发环境：Visual Studio 2019

实验过程

单词分类表

单词类型	种别码
关键字	一词一码
标识符_变量名	多词一码
标识符_函数名	多词一码
标识符_数组名	多词一码
常数	多词一码
运算符与界符	一词一码

关键字：auto       break    case     char        const      continue

            default    do       double   else        enum       extern

            float      for      goto     if          int        long

            register   return   short    signed      sizeof     static

            struct     switch   typedef union       unsigned   void

            volatile    while
界符： ‘/*’、‘//’、 () { } [ ] " " '
运算符： <、<=、>、>=、=、==、!=、+、-、*、/、%、^、&、&&、|、||、~、!、<<、>>
对所有可数符号进行编码：

    <auto,1>...<while,32>

    <+,33><-,34><*,35></,36><<,37><<=,38><>,39><>=,40><=,41><==,42><!=,43><;,44><(,45>

    <),46><^,47><,,48><",49><',50><#,51><&,52><&&,53><|,54><||,55><%,56><~,57><<<,58>左移<>>,59>右移

    <[,60><],61><{,62><},63><\,64><.,65><?,66><:,67><!,68><常数,99><标识符,100>
算法描述
预处理程序算法(去除源程序中的注释与无效字符)

 if (r[i] == '/'&&r[i + 1] == '/')// line comment "//": skip everything up to the newline
            while (r[i] != '\n')
                 i++;// scan forward
            if (r[i] == '/'&&r[i + 1] == '*')
            {// block comment: skip its whole content
                  i += 2;
                  while (r[i] != '*' || r[i + 1] != '/')
                  {
                       i++;// keep scanning
                       if (r[i] == '\0')
                       {
                             printf("注释出错，没有找到 */，程序结束！！！\n");
                             exit(0);
                       }
                  }
                  i += 2;// step over the closing marker
            }
            if (r[i] != '\n'&&r[i] != '\t'&&r[i] != '\v'&&r[i] != '\r')
                  tmpStr[count++] = r[i];// copy the character unless it is \n, \t, \v or \r
            /* macro substitution: not implemented */

词法分析子程序

   if (IsLetter(*tmpSrcTxt))// starts with a letter or underscore
      {
            token[count++] = *tmpSrcTxt++;// collect and advance
            while (IsLetter(*tmpSrcTxt) || IsDigit(*tmpSrcTxt))// followed by letters, digits or underscores
                  token[count++] = *tmpSrcTxt++;// collect and advance
            token[count] = '\0';
            *type = searchKeyWords(KeyWords, token);// look up the category code in the keyword table
            if (*type == -1)

                  *type = ID;// not a reserved word, so it is an identifier
      }
      else if (IsDigit(*tmpSrcTxt))// first character is a digit
      {
            while (IsDigit(*tmpSrcTxt))// followed by digits
                token[count++] = *tmpSrcTxt++;// collect
            token[count] = '\0';
            *type = CONST;// category code for constants
      }
      else if (ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == ';' || ch == '(' || ch == ')' || ch == '^'|| ch == ',' || ch == '\"' || ch == '\'' || ch == '~' || ch == '#' || ch == '%' || ch == '['|| ch == ']' || ch == '{' || ch == '}' || ch == '\\' || ch == '.' || ch == '\?' || ch == ':')

      {// single-character operator or delimiter: look it up in the table
            token[0] = *tmpSrcTxt;
            token[1] = '\0';// build a one-character string
            for (i = 0; i < 36; i++)
            {// search the operator/delimiter table
                  if (strcmp(token, Operator[i]) == 0)
                  {
                       *type = 33 + i;// table index maps linearly onto the category code
                       break;// stop at the first match
                  }
            }
            tmpSrcTxt++;// advance, ready for the next scan
      }

完整程序

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
using namespace std;
/*
	Token classes
	  Class 1: identifiers
	  Class 2: constants
	  Class 3: reserved words (32)
	    auto       break    case     char        const      continue
	    default    do       double   else        enum       extern
	    float      for      goto     if          int        long
	    register   return   short    signed      sizeof     static
	    struct     switch   typedef  union       unsigned   void
	    volatile   while
	  Class 4: delimiters   comment markers, ( ) { } [ ] " '
	  Class 5: operators    <  <=  >  >=  =  +  -  *  /  ^
	Category codes assigned to every enumerable symbol:
	  <auto,1> ... <while,32>
	  <+,33> <-,34> <*,35> </,36> <<,37> <<=,38> <>,39> <>=,40> <=,41> <==,42> <!=,43> <;,44> <(,45>
	  <),46> <^,47> <,,48> <",49> <',50> <#,51> <&,52> <&&,53> <|,54> <||,55> <%,56> <~,57>
	  <<<,58> (left shift) <>>,59> (right shift)
	  <[,60> <],61> <{,62> <},63> <\,64> <.,65> <?,66> <:,67> <!,68>
	  <constant,99>   attribute: its value
	  <identifier,100> attribute: pointer to the identifier
*/
#define ID 100		// category code: identifier
#define	CONST 99 	// category code: numeric constant
#define	STRING 98 	// category code: string literal (Scanner in this file never produces it)
//#define	DIVIDE 5 	// delimiter (disabled)
//#define	ALU 6		// operator (disabled)
#define ID_A 7		// identifier used as an array name
#define ID_B 8		// identifier used as a variable name
#define ID_F 9		// identifier used as a function name
// NOTE(review): ID_A/ID_B/ID_F are not referenced by the visible code, and their
// values fall inside the keyword code range 1..32.
#define INVALID -1	// no token produced (whitespace or end of input)
#define MAXTEXT 10000	// capacity, in characters, of the source text buffer
#define MAXID	1000	// number of rows in the identifier table
#define IDLEN	30		// row width of the keyword/identifier tables (29 characters + terminator)
#define MAXKEY	32		// number of keywords

// Keyword table. Row i holds the keyword whose category code is i + 1
// (searchKeyWords returns the index plus one).
static char KeyWords[MAXKEY][IDLEN] = 
{
	"auto", "break", "case", "char", "const", "continue",
	"default", "do", "double", "else", "enum", "extern",
	"float", "for", "goto", "if", "int", "long",
	"register", "return", "short", "signed", "sizeof", "static",
	"struct", "switch", "typedef", "union", "unsigned", "void",
	"volatile", "while"
};
// Operator/delimiter table. Row i corresponds to category code 33 + i, so the
// order of the entries must not change.
static char Operator[36][10] = 
{
	"+", "-", "*", "/", "<", "<=", ">", ">=", "=", "==",
	"!=", ";", "(", ")", "^", ",", "\"", "\'", "#", "&",
	"&&", "|", "||", "%", "~", "<<", ">>", "[", "]", "{",
	"}", "\\", ".", "\?", ":", "!"
};
// Identifier table, filled by main as identifiers are met; an empty string
// marks a free row.
static  char IDentifierTbl[MAXID][IDLEN] = { "" };

/******** Keyword lookup ********
 * Compares s against each of the MAXKEY rows of KeyWords.
 * Returns the 1-based position of the matching row (the keyword's category
 * code), or -1 when s is not a keyword.
 */
int searchKeyWords(char KeyWords[][IDLEN], char *s)
{
	int code = 1;	// category code of the row currently being compared
	while (code <= MAXKEY)
	{
		if (strcmp(KeyWords[code - 1], s) == 0)
			return code;
		++code;
	}
	return -1;	// no row matched: the caller treats s as an identifier
}

/******** True when letter is 'a'..'z', 'A'..'Z' or '_' ********/
bool IsLetter(char letter)
{
	const bool isLower = ('a' <= letter) && (letter <= 'z');
	const bool isUpper = ('A' <= letter) && (letter <= 'Z');
	return isLower || isUpper || letter == '_';
}

/******** True when digit is one of '0'..'9' ********/
bool IsDigit(char digit)
{
	return !(digit < '0' || digit > '9');
}

/**********编译预处理，去除无用的字符和注释************/
void filterResource(char *r, int *rLen)
{
	if (r == NULL || rLen == NULL)
		return;
	char tmpStr[MAXTEXT];
	char From[20] = { 0 };	//替换前
	char To[20] = { 0 };	//替换后
	char cur = 0;
	int count = 0;
	for (int i = 0; i <= *rLen; i++)
	{
		if (r[i] == '/'&&r[i + 1] == '/')//若为单行注释“//”,则去除注释后面的东西，直至遇到回车换行
			while (r[i] != '\n')
				i++;//向后扫描
		if (r[i] == '/'&&r[i + 1] == '*')
		{//若为多行注释“/* 。。。*/”则去除该内容
			i += 2;
			while (r[i] != '*' || r[i + 1] != '/')
			{
				i++;//继续扫描
				if (r[i] == '\0')
				{
					printf("注释出错，没有找到 */，程序结束！！！\n");
					exit(0);
				}
			}
			i += 2;//跨过“*/”
		}
		if (r[i] != '\n'&&r[i] != '\t'&&r[i] != '\v'&&r[i] != '\r')
			tmpStr[count++] = r[i];//若出现无用字符，则过滤；否则加载
		/*宏替换  未实现*/
		//if (r[i] == '#')
		//{
		//	char preCompiler[20] = { 0 };
		//	char j = 0;
		//	i++;	//跳过#
		//	while (r[i] != ' ')
		//		preCompiler[j++]=r[i++];
		//	j = 0;
		//	i++;	//跳过空格
		//	if (strcmp(preCompiler, "define")==0)
		//	{
		//		cur = i;
		//		while (r[i] != ' ')
		//			From[j++] = r[i++];
		//		j = 0;
		//		i++;
		//		
		//		while (r[i] != ' '&& r[i] != '\n')
		//			To[j++] = r[i++];
		//		j = 0;
		//		i++;
		//	}
		//	i = cur;
		//}
		//if (r[i] == *From)	//匹配到宏替换的首字符
		//{
		//	char size = strlen(From);
		//	char j = 0, k = 0;
		//	while (r[i++] == From[j++]);
		//	if (j == size+1)	//完全匹配 开启替换
		//		tmpStr[count] = To[k++];
		//}
	}
	tmpStr[count] = '\0';
	*rLen = count;	//修改净化之后源程序的长度
	strcpy(r, tmpStr);//产生净化之后的源程序
}

/**************************** Scanner: recognise one token ***********************
 * Reads the token that starts at srcTxt[*pCur] and advances *pCur past it.
 *
 *   type    out    category code: keyword 1..32, operator/delimiter 33..68,
 *                  CONST, ID, or INVALID when no token was produced (a run of
 *                  spaces, or end of input).
 *   srcTxt  in     filtered, NUL-terminated source text.
 *   token   out    token text; must point to at least 20 bytes. Identifiers and
 *                  numbers longer than 19 characters are consumed completely but
 *                  stored truncated, so the buffer cannot overflow.
 *   pCur    inout  offset of the next unread character.
 *   pSrc    in     end offset; *pCur is set to it when end of input is reached.
 *
 * End of input is either '\0' or the (char)-1 marker left by a reader that
 * stored EOF in the buffer; in that case token is left untouched.
 * A character that starts no known token is fatal: a message is printed and
 * the program exits with a failure status.
 */
void Scanner(int *type/*out*/, char *srcTxt/*in*/, char *token/*out*/,int *pCur/*inout*/,int pSrc)
{
	const int kTokenCap = 20;	// size of the caller's token buffer
	const int kOperatorCount = sizeof(Operator) / sizeof(Operator[0]);

	char *cursor = srcTxt + *pCur;	// base address + offset
	const char ch = *cursor;

	// Compare against (char)-1 rather than the int -1 so the test also works
	// where plain char is unsigned.
	if (ch == '\0' || ch == static_cast<char>(-1))
	{
		*pCur = pSrc;
		*type = INVALID;
		return;
	}

	memset(token, 0, kTokenCap);	// start every scan from an empty token
	int count = 0;					// number of characters stored in token

	if (IsLetter(ch))
	{
		// identifier or keyword: [A-Za-z_][A-Za-z0-9_]*
		while (IsLetter(*cursor) || IsDigit(*cursor))
		{
			if (count < kTokenCap - 1)
				token[count++] = *cursor;
			cursor++;
		}
		token[count] = '\0';
		*type = searchKeyWords(KeyWords, token);
		if (*type == -1)
			*type = ID;	// not a reserved word
	}
	else if (IsDigit(ch))
	{
		// unsigned integer constant: [0-9]+
		while (IsDigit(*cursor))
		{
			if (count < kTokenCap - 1)
				token[count++] = *cursor;
			cursor++;
		}
		token[count] = '\0';
		*type = CONST;
	}
	else if (ch == ' ')
	{
		while (*cursor == ' ')
			cursor++;
		*type = INVALID;	// whitespace yields no token
	}
	else
	{
		// Operator or delimiter: take the longest table entry that matches at
		// the cursor, so "<=" wins over "<" and ">>" over ">".
		int best = -1;
		size_t bestLen = 0;
		for (int i = 0; i < kOperatorCount; i++)
		{
			const size_t opLen = strlen(Operator[i]);
			if (opLen > bestLen && strncmp(cursor, Operator[i], opLen) == 0)
			{
				best = i;
				bestLen = opLen;
			}
		}
		if (best < 0)
		{
			// not recognised by any rule above
			printf("error：there is no exist %c \n", *cursor);
			exit(EXIT_FAILURE);
		}
		strcpy(token, Operator[best]);
		*type = 33 + best;	// table index maps linearly onto the category code
		cursor += bestLen;
	}

	*pCur = static_cast<int>(cursor - srcTxt);	// position after this scan
}

int main()
{	
	char srcTxt[MAXTEXT] = { 0 };	//打开一个文件，读取其中的源程序,以字符形式存储在srcTxt[MAXTEXT]中
	char token[20] = { 0 };
	int type = -1;	//初始化
	int pSrc = 0;	//源程序指针
	int pCur = 0;	//程序所读到的指针位置
	char idTurn = 0;	//为1时标识符进入可转换状态
	char tmpStr[IDLEN] = { 0 };	//保存此次标识符
	FILE *fpSrc=NULL, *fpPre=NULL, *fpAfterCiFa=NULL;	//源文件，预编译后的文件，词法分析后的文件
	//-------------------
	if ((fpSrc = fopen("F:\\test.txt", "r")) == NULL)	//打开源程序文件
	{
		cout << "can't open this file";
		exit(0);
	}
	while ( (srcTxt[ pSrc++ ] = fgetc(fpSrc) ) != EOF );//将源程序读入srcTxt[]数组
	srcTxt[pSrc] = '\0';
	fclose(fpSrc);
	//--------------------
	cout << "源程序为:" << endl;
	cout << srcTxt << endl;

	filterResource(srcTxt, &pSrc);	//对源程序进行过滤
	--------------------
	//if ((fpPre = fopen("F:\\Precompile.txt", "w")) == NULL)	//生成预编译文件
	//{
	//	cout << "can't open this file";
	//	exit(0);
	//}
	//while ((fputc(fpPre) = srcTxt[pSrc++]) != EOF);//将预处理后的程序写到预编译文件
	//fclose(fpSrc);
	---------------------
	cout << endl << "过滤之后的程序:" << endl;
	cout << srcTxt << endl;
	
	if ((fpAfterCiFa = fopen("F:\\AfterCiFa.txt", "w")) == NULL)	//打开词法分析文件
	{
		cout << "can't open this file";
		exit(0);
	}
	while (pCur < pSrc)	//当文件不结束时
	{
		//启动扫描  类型号 源文件 标识符 当前指针 结束指针
		Scanner(&type, srcTxt, token, &pCur, pSrc);	//返回当前指针位置，判断是否到文件结束
		if (type == INVALID)	//若type为无效字符，则跳过本次循环
			continue;
		switch (type)
		{
			case ID:
				for (int i = 0; i < 1000; i++)
				{//插入标识符表中
					if (strcmp(IDentifierTbl[i], token) == 0)
						break;
					if (strcmp(IDentifierTbl[i], "") == 0)
					{//查找空间
						strcpy(IDentifierTbl[i], token);
						break;
					}
				}
				strcpy(tmpStr, token);	//保存此次标识符
				idTurn = 1;	//开启可转换标志 因为当前为标识符  可转化为数组名 变量名 函数名等
				Scanner(&type, srcTxt, token, &pCur, pSrc);
				if (idTurn == 1 && strcmp(token, "(") == 0)	//函数名
				{
						printf("(函数名,%s)\n", tmpStr);
						fprintf(fpAfterCiFa, "(函数名,%s)\n", tmpStr);
						idTurn = 0;
				}
				else if (idTurn == 1 && strcmp(token, "[") == 0)
				{
					printf("(数组名,%s)\n", tmpStr);
					fprintf(fpAfterCiFa, "(数组名,%s)\n", tmpStr);
					idTurn = 0;
				}
				else
				{
					printf("(变量名,%s)\n", tmpStr);
					fprintf(fpAfterCiFa, "(变量名,%s)\n", tmpStr);
					idTurn = 0;
				}
				break;
			case STRING:	//字符串
				break;
			case CONST:		//常量
				printf("(常数,%s)\n", token);
				fprintf(fpAfterCiFa, "(常数,%s)\n", token);
				break;
			//case DIVIDE:	//分界符
			//	break;
			default:
				break;
		}
		if (type >= 1 && type <= 32)	//关键字
		{
			printf("(关键字%d,%s)\n", type, KeyWords[type - 1]);
			fprintf(fpAfterCiFa, "(关键字%d,%s)\n", type, KeyWords[type - 1]);
		}
		else if (type >= 33 && type <= 68)//运算符
		{
			printf("(界符运算符%d,%s)\n", type, Operator[type - 33]);
			fprintf(fpAfterCiFa, "(界符运算符%d,%s)\n", type, Operator[type - 33]);
		}
	}
	fclose(fpAfterCiFa);
	return 0;
}
//bool readFile(FILE *p, char*buffer)	//将文件内容读到buffer
//{
//}
//bool writeFile(FILE *p, char*buffer)//将buffer内容写入文件
//{
//}