说明:
为实践《编译原理》中的相关知识,认真完成了课程设计,实现了C-语言的词法分析器与语法分析器
C-语言是C语言的一个子集,语法包括:
整型变量与函数的声明
if else 分支语句
while 循环语句
本篇介绍词法分析器的实现
流程:
- 写出该语言的词法规则与正则表达式
- 构造DFA
- 代码实现
该语言词法规则与正则表达式
1.保留字
int void if else return while
正则表达式即为原串,在代码中当作标识符匹配,匹配完后再与保留字比较
2.标识符
letter = [a-zA-Z]
digit = [0-9]
正则表达式 letter (letter | digit )*
3.数字
digit = [1-9]
D = 0 | digit
整型:
正则表达式:0 | digit D*
浮点型:
正则表达式:(0 | digit D*) . D* (e -? D D*)?
4.符号
+ - * / % > >= < <= [ ] ( ) { } = != == , ;
正则表达式即符号本身
5.注释
C = 所有字符
//型:
正则表达式:// C*
/**/型:
正则表达式:/* C* */
构造DFA
标识符:
(不考虑带 _ 的标识符,否则 DFA 图会过于复杂)
整型:
浮点数:
//型注释:
/**/型注释:
代码实现
用switch case实现DFA
伪代码为:
switch(state)
{
case 1:
c = getnextchar();
state = goto(state, c); //根据当前状态与字符判断跳转到哪一个状态
token.push_back(c); //将字符保存进token
break;
case 2:
c = getnextchar();
state = goto(state, c); //根据当前状态与字符判断跳转到哪一个状态
token.push_back(c); //将字符保存进token
break;
case 接受状态:
print(token); //打印保存token
break;
……
}
在状态转移的过程中,需要向前额外看一位,判断是否接受当前的token
如在匹配int a = 123;的过程中,匹配到a时,向前看一位是=,于是将a保存为一个token,同时指针退一位,下次从=开始匹配
详细代码:
//Scanner.h
//作者:IuSpet
//作用:将c-源代码转化为token输出
#ifndef Scanner_h
#define Scanner_h
#include"utlib.h"
// Lexical scanner for C-minus source code: reads a source file and turns it
// into a stream of tokens.
class Scanner
{
public:
    // Opens the file named by s for reading and puts the scanner into its
    // initial state. Whether the open succeeded is not checked here.
    Scanner(const char* s)
        : pos(0),
          syn(-1),
          state(0),
          sourcename(s),
          filepos(0),
          infile(s)
    {
    }

    // Closes the source file.
    ~Scanner()
    {
        infile.close();
    }

    void GetToken();              // walks the DFA and recognises tokens
    bool IsNum(const char c);     // digit test
    bool IsLetter(const char c);  // letter test
    char GetNext();               // fetches the next input character
    void Back();                  // undoes one character of look-ahead

private:
    std::string buffer;           // text of the source currently buffered
    int pos;                      // read position inside buffer
    int syn;                      // category of the current token (-1: none)
    int state;                    // current DFA state
    std::string sourcename;       // file name handed to the constructor
    int filepos;                  // initialised to 0 by the constructor
    std::ifstream infile;         // stream the source is read from
    const int BUFFERLENGTH = 4096;
};
#endif // !Scanner_h
#pragma once
//Scanner.cpp
//作者:IuSpet
//作用:将c-源代码转化为token输出
#include "scanner.h"
namespace
{
// Maps a scanned word to its token category: 0..7 for the reserved words
// (if, else, int, double, return, void, while, char - in that order) and
// 27 for an ordinary identifier.
int ClassifyWord(const std::string& word)
{
    static const char* const kReservedWords[] =
        { "if", "else", "int", "double", "return", "void", "while", "char" };
    int category = 0;
    for (const char* reserved : kReservedWords)
    {
        if (word == reserved) return category;
        ++category;
    }
    return 27;
}

// Writes one accepted token to out as "<KIND,text>" followed by a newline.
// Categories 31 and 32 (the two comment forms) produce no output; every
// category without a dedicated label is reported as SYMBOL.
void WriteToken(std::ostream& out, int category, const std::string& text)
{
    const char* kind = "SYMBOL,";
    if (category >= 0 && category <= 7) kind = "RESERVED WORD,";
    else if (category == 27) kind = "ID,";
    else if (category == 28) kind = "NUM,";
    else if (category == 29) kind = "DOUBLE,";
    else if (category == 30) kind = "STRING,";
    else if (category == 31 || category == 32) return;
    out << "<" << kind << text << ">" << '\n';
}
} // namespace

// Scans the whole source file and writes the recognised tokens to
// D://cminus//token.txt, one "<KIND,text>" entry per line.
//
// The scanner is a DFA driven by a switch on `state`: state 0 is the start
// state, 99 is the error state and 100 means "current lexeme finished".
// `syn` holds the category of the finished lexeme (-1 when nothing is to be
// emitted). States that need one character of look-ahead read it with
// GetNext() and hand it back with Back() when it does not belong to the
// lexeme. Lexical errors are reported on std::cout and the input is skipped
// up to the next blank, tab, newline or ';'.
void Scanner::GetToken()
{
    // GetNext() returns EOF narrowed to char, so compare against the same
    // narrowed value; comparing a plain char with the int EOF never matches
    // on platforms where char is unsigned.
    const char kEof = static_cast<char>(EOF);
    constexpr int kStart = 0;
    constexpr int kError = 99;
    constexpr int kAccept = 100;

    char ch;
    std::string token; // text of the lexeme being built; grows as needed

    std::ofstream outfile("D://cminus//token.txt");
    if (!outfile)
    {
        // Keep scanning so lexical errors are still reported on stdout.
        std::cerr << "warning: cannot open D://cminus//token.txt, tokens will not be saved" << std::endl;
    }

    // Appends the current character to the lexeme and reads the look-ahead.
    auto advance = [&]() { token.push_back(ch); ch = GetNext(); };
    // Hands the look-ahead back. At end of input nothing was consumed, so
    // there is nothing to give back.
    auto unread = [&]() { if (ch != kEof) Back(); };
    // Finishes the lexeme with the given category.
    auto accept = [&](int category) { syn = category; state = kAccept; };
    // Appends the current character and finishes the lexeme.
    auto acceptChar = [&](int category) { token.push_back(ch); accept(category); };

    while ((ch = GetNext()) != kEof)
    {
        while (state != kAccept)
        {
            switch (state)
            {
            case kStart: // dispatch on the first character of a lexeme
                switch (ch)
                {
                case '+': state = 1; break;
                case '-': state = 2; break;
                case '*': state = 3; break;
                case '/': state = 4; break;
                case '<': state = 5; break;
                case '>': state = 6; break;
                case '=': state = 7; break;
                case ';': state = 8; break;
                case '!': state = 9; break;
                case '[': state = 10; break;
                case ']': state = 11; break;
                case '(': state = 12; break;
                case ')': state = 13; break;
                case '{': state = 14; break;
                case '}': state = 15; break;
                case '"': state = 16; break;
                case ',': state = 17; break;
                case ' ':
                case '\t':
                case '\n':
                    state = kAccept; // whitespace: nothing to emit
                    break;
                default:
                    if (IsLetter(ch)) state = 18;
                    else if (IsNum(ch)) state = 19;
                    else state = kError;
                }
                break;
            case 1: // matched '+'; a directly following digit makes it a signed number
                advance();
                if (IsNum(ch)) state = 19;
                else
                {
                    unread();
                    accept(8);
                }
                break;
            case 2: // matched '-'
                acceptChar(9);
                break;
            case 3: // matched '*'
                acceptChar(10);
                break;
            case 4: // matched '/': division or the start of a comment
                advance();
                if (ch == '/') state = 20;
                else if (ch == '*') state = 21;
                else
                {
                    unread();
                    accept(11);
                }
                break;
            case 20: // inside a // comment: skip to the end of the line
                do
                {
                    ch = GetNext();
                } while (ch != '\n' && ch != kEof);
                accept(31);
                break;
            case 21: // inside a /* comment
                ch = GetNext();
                if (ch == kEof) state = kError; // unterminated comment
                else if (ch == '*') state = 22;
                else state = 21;
                break;
            case 22: // inside a /* comment, last character was '*'
                ch = GetNext();
                if (ch == kEof) state = kError; // unterminated comment
                else if (ch == '*') state = 22;
                else if (ch == '/') state = 23;
                else state = 21;
                break;
            case 23: // matched a complete /* ... */ comment
                accept(32);
                break;
            case 5: // matched '<'
                advance();
                if (ch == '=') state = 24;
                else
                {
                    unread();
                    accept(12);
                }
                break;
            case 24: // matched "<="
                acceptChar(23);
                break;
            case 6: // matched '>'
                advance();
                if (ch == '=') state = 25;
                else
                {
                    unread();
                    accept(13);
                }
                break;
            case 25: // matched ">="
                acceptChar(24);
                break;
            case 7: // matched '='
                advance();
                if (ch == '=') state = 26;
                else
                {
                    unread();
                    accept(14);
                }
                break;
            case 26: // matched "=="
                acceptChar(25);
                break;
            case 8: // matched ';'
                acceptChar(15);
                break;
            case 9: // matched '!': only valid as the start of "!="
                advance();
                if (ch == '=') state = 27;
                else
                {
                    // Give the look-ahead back so error recovery inspects it
                    // instead of skipping it (same as states 31 and 33).
                    unread();
                    state = kError;
                }
                break;
            case 27: // matched "!="
                acceptChar(26);
                break;
            case 10: // matched '['
                acceptChar(17);
                break;
            case 11: // matched ']'
                acceptChar(18);
                break;
            case 12: // matched '('
                acceptChar(19);
                break;
            case 13: // matched ')'
                acceptChar(20);
                break;
            case 14: // matched '{'
                acceptChar(21);
                break;
            case 15: // matched '}'
                acceptChar(22);
                break;
            case 16: // inside a string literal
                advance();
                if (ch == kEof) state = kError; // unterminated string
                else if (ch == '"') state = 28;
                else state = 16;
                break;
            case 28: // matched the closing '"'
                acceptChar(30);
                break;
            case 17: // matched ','
                acceptChar(16);
                break;
            case 18: // inside an identifier or reserved word
                advance();
                if (IsLetter(ch) || IsNum(ch)) state = 18;
                else
                {
                    unread();
                    state = 29;
                }
                break;
            case 29: // word complete: reserved word or identifier?
                accept(ClassifyWord(token));
                break;
            case 19: // inside the integer part of a number
                advance();
                if (IsNum(ch)) state = 19;
                else if (ch == '.') state = 30;
                else
                {
                    unread();
                    accept(28);
                }
                break;
            case 30: // matched digits '.' and possibly fraction digits
                advance();
                if (IsNum(ch)) state = 30;
                else if (ch == 'e') state = 31;
                else
                {
                    unread();
                    accept(29);
                }
                break;
            case 31: // matched the exponent marker 'e'
                advance();
                if (IsNum(ch)) state = 32;
                else if (ch == '-') state = 33;
                else
                {
                    unread();
                    state = kError;
                }
                break;
            case 32: // inside the exponent digits
                advance();
                if (IsNum(ch)) state = 32;
                else
                {
                    unread();
                    accept(29);
                }
                break;
            case 33: // matched 'e' '-': at least one digit must follow
                advance();
                if (IsNum(ch)) state = 32;
                else
                {
                    unread();
                    state = kError;
                }
                break;
            case kError: // report the error, then resynchronise
                std::cout << std::endl;
                std::cout << "error" << std::endl;
                std::cout << static_cast<int>(ch) << " " << ch << std::endl;
                // Skip to the next delimiter, stopping at end of input so an
                // error near the end of the file cannot loop forever.
                do
                {
                    ch = GetNext();
                } while (ch != kEof && ch != ' ' && ch != '\t' && ch != '\n' && ch != ';');
                unread();
                accept(-1);
                break;
            default: // unknown state: treat as a lexical error instead of spinning
                state = kError;
                break;
            }
        }

        if (syn != -1) // a lexeme was accepted
        {
            WriteToken(outfile, syn, token);
        }
        // Reset unconditionally so the text of a rejected lexeme cannot leak
        // into the next token.
        token.clear();
        state = kStart;
        syn = -1;
    }
}
// Reports whether c is one of the characters '0' through '9'.
bool Scanner::IsNum(const char c)
{
    const bool outsideDigits = (c < '0') || (c > '9');
    return !outsideDigits;
}
// Reports whether c is a letter in 'a'..'z' or 'A'..'Z'.
// The underscore is not treated as a letter.
bool Scanner::IsLetter(const char c)
{
    const bool isLower = ('a' <= c) && (c <= 'z');
    const bool isUpper = ('A' <= c) && (c <= 'Z');
    return isLower || isUpper;
}
// Returns the next character of the source file, or EOF (narrowed to char)
// once the input is exhausted.
//
// The file is consumed one line at a time: when the buffered line has been
// read completely, the next line is fetched with std::getline and a '\n' is
// appended, so every line - including a last line without a trailing
// newline - is delivered with a terminating '\n'.
char Scanner::GetNext()
{
    // pos is a signed int: rule out a negative value explicitly instead of
    // relying on the implicit signed-to-unsigned conversion in the comparison.
    if (pos >= 0 && static_cast<std::string::size_type>(pos) < buffer.length())
    {
        return buffer[pos++];
    }

    if (!std::getline(infile, buffer))
    {
        // When the stream is already at end-of-file std::getline leaves
        // buffer untouched. Drop the stale line so that a Back() issued after
        // EOF cannot make an already consumed character readable again; from
        // here on every call keeps returning EOF.
        buffer.clear();
        pos = 0;
        return static_cast<char>(EOF);
    }

    buffer.push_back('\n');
    pos = 0;
    return buffer[pos++];
}
// Steps the read position back by one so the character most recently
// returned by GetNext() is delivered again on the next call.
void Scanner::Back()
{
    --pos;
}
测试
测试源码:
int main()
{
//annotation1
int a = 123;
double b = 12.2e-2;
char str = "hello world";
if(a != 123)
{
return 1;
}
/****
annotation2
****/
else
{
b = a * b + a / b;
}
return 0;
}
测试结果:
<RESERVED WORD,int>
<ID,main>
<SYMBOL,(>
<SYMBOL,)>
<SYMBOL,{>
<RESERVED WORD,int>
<ID,a>
<SYMBOL,=>
<NUM,123>
<SYMBOL,;>
<RESERVED WORD,double>
<ID,b>
<SYMBOL,=>
<DOUBLE,12.2e-2>
<SYMBOL,;>
<RESERVED WORD,char>
<ID,str>
<SYMBOL,=>
<STRING,"hello world">
<SYMBOL,;>
<RESERVED WORD,if>
<SYMBOL,(>
<ID,a>
<SYMBOL,!=>
<NUM,123>
<SYMBOL,)>
<SYMBOL,{>
<RESERVED WORD,return>
<NUM,1>
<SYMBOL,;>
<SYMBOL,}>