C-语言词法分析器与语法分析器(一)

说明:

为实践《编译原理》中的相关知识,认真完成了课程设计,实现了C-语言的词法分析器与语法分析器

C-语言是C语言的一个子集,语法包括:

整型变量与函数的声明

if else 分支语句

while 循环语句


本篇介绍词法分析器的实现

流程:

  1. 写出该语言的词法规则与正则表达式
  2. 构造DFA
  3. 代码实现

该语言词法规则与正则表达式

1.保留字

int   void   if   else   return   while   

正则表达式即为原串,在代码中当作标识符匹配,匹配完后再与保留字比较

2.标识符

letter = [a-zA-Z]

digit = [0-9]

正则表达式 letter (letter | digit )*

3.数字

digit = [1-9](注意:本节中的 digit 不含 0,与上文标识符规则中的 digit = [0-9] 不同)

D = 0 | digit

整型:

正则表达式:digit D*

浮点型:

正则表达式:digit D* (\. D*)? (e -? digit D*)?

4.符号

+   -   *   /   %   =   >   >=   <   <=   [   ]   (   )   {   }   !=   ==    ,    ;

正则表达式即符号本身

5.注释

C = 所有字符

//型:

正则表达式:// C*

/**/型:

正则表达式:/* C* */

构造DFA

标识符:

(不考虑带 _ 的标识符,画多了)

整型:

浮点数:

//型注释:

/**/型注释:

代码实现

用switch case实现DFA

伪代码为:

​switch(state)
{
    case 1:
        c = getnextchar();
        state = goto(state, c);    //根据当前状态与字符判断跳转到哪一个状态
        token.push_back(c);        //将字符保存进token
        break;
    case 2:
        c = getnextchar();
        state = goto(state, c);    //根据当前状态与字符判断跳转到哪一个状态
        token.push_back(c);        //将字符保存进token
        break;
    case 接受状态:
        print(token);              //打印保存token
        break;
    ……
}

​

​

在状态转移的过程中,需要向前额外看一位,判断是否接受当前的token

如在匹配int a = 123;的过程中,匹配到a时,向前看一位是=,于是将a保存为一个token,同时指针退一位,下次从=开始匹配

详细代码:

//Scanner.h
//作者:IuSpet
//作用:将c-源代码转化为token输出

#ifndef Scanner_h
#define Scanner_h
#include"utlib.h"

// Lexer for C-minus source files: reads the file named in the constructor and
// turns it into tokens (see GetToken).
class Scanner
{
public:
	// Opens the source file `s` for reading. A failed open is not reported
	// here; it only leaves `infile` in a failed state.
	Scanner(const char* s)
		: sourcename(s), infile(s)
	{
	}

	// Closes the source file.
	~Scanner()
	{
		infile.close();
	}

	void GetToken();				// walk the DFA and recognise tokens
	bool IsNum(const char c);
	bool IsLetter(const char c);
	char GetNext();					// fetch the next input character
	void Back();					// un-read one character after a look-ahead

private:
	std::string buffer;				// current chunk of source text being scanned
	int pos = 0;					// read position inside `buffer`
	int syn = -1;					// category code of the current token (-1 = none)
	int state = 0;					// current DFA state
	std::string sourcename;			// path of the source file, as given to the constructor
	int filepos = 0;
	std::ifstream infile;			// stream the source text is read from
	const int BUFFERLENGTH = 4096;
};


#endif // !Scanner_h
#pragma once
//Scanner.cpp
//作者:IuSpet
//作用:将c-源代码转化为token输出

#include "scanner.h"


void Scanner::GetToken()
{
	char ch;
	constexpr int TOKENLENGTH = 256;
	char token[TOKENLENGTH];
	memset(token, 0, TOKENLENGTH);
	int tokenpos = 0;
	std::ofstream outfile("D://cminus//token.txt");
	while ((ch = GetNext()) != EOF)
	{
		//todo: 标识符,关键字,整型,浮点数运算符,注释,界符,字符串的dfa
		while (state != 100)
			switch (state)
			{
			case 0:									//开始状态
				if (ch == '+') state = 1;
				else if (ch == '-') state = 2;
				else if (ch == '*') state = 3;
				else if (ch == '/') state = 4;
				else if (ch == '<') state = 5;
				else if (ch == '>') state = 6;
				else if (ch == '=') state = 7;
				else if (ch == ';') state = 8;
				else if (ch == '!') state = 9;
				else if (ch == '[') state = 10;
				else if (ch == ']') state = 11;
				else if (ch == '(') state = 12;
				else if (ch == ')') state = 13;
				else if (ch == '{') state = 14;
				else if (ch == '}') state = 15;
				else if (ch == '"') state = 16;
				else if (ch == ',')state = 17;
				else if (IsLetter(ch)) state = 18;
				else if (IsNum(ch)) state = 19;
				else if (ch == ' ' || ch == '\t' || ch == '\n') state = 100;
				else state = 99;			//异常
				break;
			case 1:									//匹配到 +
				token[tokenpos++] = ch;
				ch = GetNext();
				if (IsNum(ch)) state = 19;
				else
				{
					Back();
					syn = 8;
					state = 100;
				}				
				break;
			case 2:									//匹配到 -
				token[tokenpos++] = ch;
				syn = 9;
				state = 100;
				break;
			case 3:									//匹配到 *
				token[tokenpos++] = ch;
				syn = 10;
				state = 100;
				break;
			case 4:									//匹配到 /
				token[tokenpos++] = ch;
				ch = GetNext();
				if (ch == '/') state = 20;
				else if (ch == '*') state = 21;
				else
				{
					Back();
					state = 100;
					syn = 11;
				}
				break;
			case 20:								//匹配到 //
				while ((ch = GetNext()) != '\n');
				syn = 31;
				state = 100;
				break;
			case 21:								// 匹配到 /*
				ch = GetNext();
				if (ch == '*') state = 22;
				else state = 21;
				break;
			case 22:								//匹配到 /**
				ch = GetNext();
				if (ch == '*') state = 22;
				else if (ch == '/')state = 23;
				else state = 21;
				break;
			case 23:								//匹配到 /**/
				state = 100;
				syn = 32;
				break;
			case 5:									//匹配到 <
				token[tokenpos++] = ch;
				ch = GetNext();
				if (ch == '=') state = 24;
				else
				{
					Back();;
					state = 100;
					syn = 12;
				}
				break;
			case 24:								//匹配到 <=
				token[tokenpos++] = ch;
				state = 100;
				syn = 23;
				break;
			case 6:									//匹配到 >
				token[tokenpos++] = ch;
				ch = GetNext();
				if (ch == '=') state = 25;
				else
				{
					Back();
					state = 100;
					syn = 13;
				}
				break;
			case 25:								//匹配到 >=
				token[tokenpos++] = ch;
				state = 100;
				syn = 24;
				break;
			case 7:									//匹配到 =
				token[tokenpos++] = ch;
				ch = GetNext();
				if (ch == '=') state = 26;
				else
				{
					Back();
					state = 100;
					syn = 14;
				}
				break;
			case 26:								//匹配到 ==
				token[tokenpos++] = ch;
				state = 100;
				syn = 25;
				break;
			case 8:									//匹配到 ;
				token[tokenpos++] = ch;
				state = 100;
				syn = 15;
				break;
			case 9:									//匹配到 !
				token[tokenpos++] = ch;
				ch = GetNext();
				if (ch == '=') state = 27;
				else state = 99;
				break;
			case 27:								//匹配到 !=
				token[tokenpos++] = ch;
				state = 100;
				syn = 26;
				break;
			case 10:									//匹配到 [
				token[tokenpos++] = ch;
				state = 100;
				syn = 17;
				break;
			case 11:									//匹配到 ]
				token[tokenpos++] = ch;
				state = 100;
				syn = 18;
				break;
			case 12:									//匹配到 (
				token[tokenpos++] = ch;
				state = 100;
				syn = 19;
				break;
			case 13:									//匹配到 )
				token[tokenpos++] = ch;
				state = 100;
				syn = 20;
				break;
			case 14:									//匹配到 {
				token[tokenpos++] = ch;
				state = 100;
				syn = 21;
				break;
			case 15:									//匹配到 }
				token[tokenpos++] = ch;
				state = 100;
				syn = 22;
				break;
			case 16:									//匹配到 "……
				token[tokenpos++] = ch;
				ch = GetNext();
				if (ch == '"') state = 28;
				else state = 16;
				break;
			case 28:									//匹配到 "……"
				token[tokenpos++] = ch;
				state = 100;
				syn = 30;
				break;
			case 17:									//匹配到 ,
				token[tokenpos++] = ch;
				state = 100;
				syn = 16;
				break;
			case 18:									//匹配到字母
				token[tokenpos++] = ch;
				ch = GetNext();
				if (IsLetter(ch) || IsNum(ch)) state = 18;	//向前看一位还是数字或字母
				else
				{
					Back();
					state = 29;							//向前看一位不属于标识符
				}
				break;
			case 29:									//判断匹配到的标识符是不是关键字
				if (strcmp(token, "if") == 0)
				{
					state = 100;
					syn = 0;
				}
				else if (strcmp(token, "else") == 0)
				{
					state = 100;
					syn = 1;
				}
				else if (strcmp(token, "int") == 0)
				{
					state = 100;
					syn = 2;
				}
				else if (strcmp(token, "double") == 0)
				{
					state = 100;
					syn = 3;
				}
				else if (strcmp(token, "return") == 0)
				{
					state = 100;
					syn = 4;
				}
				else if (strcmp(token, "void") == 0)
				{
					state = 100;
					syn = 5;
				}
				else if (strcmp(token, "while") == 0)
				{
					state = 100;
					syn = 6;
				}
				else if (strcmp(token, "char") == 0)
				{
					state = 100;
					syn = 7;
				}
				else
				{
					state = 100;
					syn = 27;
				}
				break;
			case 19:									//匹配到的是数字
				token[tokenpos++] = ch;
				ch = GetNext();
				if (IsNum(ch)) state = 19;				//下一位还是数字
				else if (ch == '.') state = 30;			//下一位是.
				else
				{
					Back();
					state = 100;
					syn = 28;
				}
				break;
			case 30:									//匹配到 digit D* .
				token[tokenpos++] = ch;
				ch = GetNext();
				if (IsNum(ch)) state = 30;
				else if (ch == 'e') state = 31;
				else
				{
					Back();
					state = 100;
					syn = 29;
				}
				break;
			case 31:									//匹配到 digit D* . D* e
				token[tokenpos++] = ch;
				ch = GetNext();
				if (IsNum(ch)) state = 32;
				else if (ch == '-')state = 33;
				else
				{
					Back();
					state = 99;
				}
				break;
			case 32:									//匹配到 digit D* . D* e
				token[tokenpos++] = ch;
				ch = GetNext();
				if (IsNum(ch)) state = 32;
				else
				{
					Back();
					state = 100;
					syn = 29;
				}
				break;
			case 33:									//匹配到 digit D* . D* e -
				token[tokenpos++] = ch;
				ch = GetNext();
				if (IsNum(ch)) state = 32;
				else
				{
					Back();
					state = 99;
				}
				break;
			case 99:									//匹配中出错
				std::cout << std::endl;
				std::cout << "error" << std::endl;
				std::cout << (int)ch << " " << ch << std::endl;
				ch = GetNext();
				while (ch != ' '&&ch != '\t'&&ch != '\n'&&ch != ';') ch = GetNext();
				Back();
				state = 100;
				syn = -1;
			}
		if (state == 100 && syn != -1)					//接受状态
		{
			switch (syn)
			{
			case 0:
			case 1:
			case 2:
			case 3:
			case 4:
			case 5:
			case 6:
			case 7:
				outfile << "<" << "RESERVED WORD," << token << ">" << std::endl;
				//outfile << "<" << token << ">" << std::endl;
				break;
			case 27:
				outfile << "<" << "ID," << token << ">" << std::endl;
				break;
			case 28:
				outfile << "<" << "NUM," << token << ">" << std::endl;
				break;
			case 29:
				outfile << "<" << "DOUBLE," << token << ">" << std::endl;
				break;
			case 30:
				outfile << "<" << "STRING," << token << ">" << std::endl;
				break;
			case 31:
			case 32:
				break;
			default:
				outfile << "<" << "SYMBOL," << token << ">" << std::endl;
				//outfile << "<" << token << ">" << std::endl;
			}
			memset(token, 0, TOKENLENGTH);
			tokenpos = 0;
			state = 0;
			syn = -1;
		}
		if (state = 100) state = 0;
	}
	outfile.close();
}

// Returns true when `c` is one of the ASCII decimal digits '0'..'9'.
bool Scanner::IsNum(const char c)
{
	if (c < '0') return false;
	return c <= '9';
}

// Returns true when `c` is an ASCII letter, 'a'..'z' or 'A'..'Z'.
bool Scanner::IsLetter(const char c)
{
	const bool lower = (c >= 'a' && c <= 'z');
	const bool upper = (c >= 'A' && c <= 'Z');
	return lower || upper;
}


// Returns the next character of the source file, or EOF (narrowed to char)
// once the file is exhausted. Input is consumed one line at a time; each line
// read has a '\n' appended to it before its characters are handed out.
char Scanner::GetNext()
{
	// Current line fully consumed: try to load the next one.
	if (buffer.length() <= static_cast<std::string::size_type>(pos))
	{
		if (!std::getline(infile, buffer))
		{
			return EOF;
		}
		buffer.push_back('\n');
		pos = 0;
	}
	return buffer[pos++];
}

// Un-reads the character most recently returned by GetNext(), so the next
// GetNext() call returns it again. Used after a one-character look-ahead.
void Scanner::Back()
{
	// Never step before the start of the buffer: GetNext() compares `pos`
	// with an unsigned length, so a negative `pos` would look like "line
	// exhausted" and silently drop the rest of the current line.
	if (pos > 0)
	{
		pos -= 1;
	}
}

测试

测试源码:

int main()
{
	//annotation1
	int a = 123;
	double b = 12.2e-2;
	char str = "hello world";
	if(a != 123)
	{
		return 1;
	}
	/****
	annotation2
	****/
	else
	{
		b = a * b + a / b;
	}
	return 0;
}

测试结果:

<RESERVED WORD,int>
<ID,main>
<SYMBOL,(>
<SYMBOL,)>
<SYMBOL,{>
<RESERVED WORD,int>
<ID,a>
<SYMBOL,=>
<NUM,123>
<SYMBOL,;>
<RESERVED WORD,double>
<ID,b>
<SYMBOL,=>
<DOUBLE,12.2e-2>
<SYMBOL,;>
<RESERVED WORD,char>
<ID,str>
<SYMBOL,=>
<STRING,"hello world">
<SYMBOL,;>
<RESERVED WORD,if>
<SYMBOL,(>
<ID,a>
<SYMBOL,!=>
<NUM,123>
<SYMBOL,)>
<SYMBOL,{>
<RESERVED WORD,return>
<NUM,1>
<SYMBOL,;>
<SYMBOL,}>

全部源码

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值