一、实验内容
采用C++编程(文件流用起来比较方便),输入C语言风格的字符串进行词法分析,输出<单词,标号>格式的token序列
采用的正则表达式:
1.识别变量名:letter(_|letter|digit)* 禁止_开头
2.识别数字: 整数 digit(digit*);小数 digit(digit*).digit(digit*)(小数点后必须至少跟一位数字)
FA描述:
算法思路:
1.从文件中读入序列
2.While(ptr<size())
{
GetCh()
判断ch是否数字(小数还是整数?)、字母(关键字?合法变量名?非法变量名?)、分隔符(均为一个字符故存在数组中扫描)、特殊字符(考虑到++、--等特殊情况),并转入相应的控制块
}
不足之处:
并未考虑注释的跳过,以及具有二义性的一些字符,如.(小数点还是调用?)、/(注释还是除法?)(与上下文有关)
二、代码
#include<cstring>
#include<fstream>
#include<iostream>
#include<string>
#include<vector>
using namespace std;
#define MAXBUF 255
// File-level I/O state. The two streams are constructed from the file-name
// strings declared just above them, so the declaration order here matters.
vector<char>Input;// characters of the text to be analysed
string inputfn = "test1.txt";// name of the test input file
string outputfn = "output1.txt";// name of the output file
ifstream inputfile_test(inputfn); // input stream opened on inputfn
ofstream outputfile(outputfn, ios::trunc); // output stream opened on outputfn; ios::trunc discards existing content
/* Type codes 0-31: C keywords (a keyword's code is its index in this table). */
const char* keyword[32] = {
    "if",       "else",     "for",      "break",
    "continue", "int",      "float",    "double",
    "auto",     "case",     "char",     "const",
    "default",  "do",       "enum",     "long",
    "extern",   "goto",     "register", "return",
    "short",    "signed",   "sizeof",   "static",
    "struct",   "switch",   "typedef",  "union",
    "unsigned", "void",     "volatile", "while"
};

/* Type codes 32-39: single-character delimiters. */
const char* border[8] = {
    ",", ";", "{", "}", "(", ")", "[", "]"
};

/* Arithmetic operators; documented range of type codes: 40-47. */
const char* arithmetic[8] = {
    "+", "++", "+=", "-", "--", "-=", "*", "/"
};

/* Relational / assignment operators; documented range of type codes: 48-54. */
const char* relation[7] = {
    "<", "<=", "!=", "==", "=", ">", ">="
};

/* Number of entries in each table above. */
int keywordnum  = static_cast<int>(sizeof(keyword) / sizeof(keyword[0]));
int bordernum   = static_cast<int>(sizeof(border) / sizeof(border[0]));
int arithnum    = static_cast<int>(sizeof(arithmetic) / sizeof(arithmetic[0]));
int relationnum = static_cast<int>(sizeof(relation) / sizeof(relation[0]));

/* Type codes for tokens that are not table entries. */
int consttype = 55;  // integer constant
int floattype = 56;  // decimal (floating-point) constant
int idtype    = 57;  // identifier

/* Scanner state shared by the reading helpers. */
char ch = '\0';  // most recently read input character
int ptr = 0;     // index of the next unread character
// Reads every character of the input file (whitespace included) into Input,
// then closes the file.
// Throws std::string("Open File FAIL") when the input file is not open.
void ReadFile_Char()
{
    if (!inputfile_test.is_open()) // the file must have been opened successfully
    {
        std::string sErrmsg = "Open File FAIL";
        throw sErrmsg;
    }
    inputfile_test.unsetf(std::ios::skipws); // keep blanks, tabs and newlines

    // Test the extraction itself rather than eof(): eof() only becomes true
    // after a read has failed, so looping on it stores one bogus trailing
    // character (and an indeterminate one for an empty file).
    char next;
    while (inputfile_test >> next)
    {
        Input.push_back(next);
    }
    inputfile_test.close();
}
// Returns true when ch is an ASCII letter ('a'-'z' or 'A'-'Z').
bool isLetter(char ch)
{
    const bool lowerCase = ('a' <= ch) && (ch <= 'z');
    const bool upperCase = ('A' <= ch) && (ch <= 'Z');
    return lowerCase || upperCase;
}
// Returns true when ch is a decimal digit ('0'-'9').
bool isDigit(char ch)
{
    return !(ch < '0' || ch > '9');
}
// Looks ch up in the delimiter table (every delimiter is a single character).
// Returns the delimiter's type code (its table index plus keywordnum),
// or -1 when ch is not a delimiter.
int isBorder(char ch)
{
    for (int idx = 0; idx < bordernum; ++idx)
    {
        if (ch == border[idx][0])
        {
            return keywordnum + idx;
        }
    }
    return -1;
}
// Looks the NUL-terminated word is_res up in the keyword table.
// Returns the keyword's type code (its index in the table, 0-31),
// or -1 when is_res is null or is not a keyword.
// -1 is used for "not found" (as isBorder does) because 0 is itself a valid
// code: it is the index of "if".
int isKeyword(char* is_res) {
    if (is_res == nullptr)
        return -1;
    for (int i = 0; i < keywordnum; i++) {
        if (std::strcmp(keyword[i], is_res) == 0)
            return i;
    }
    return -1;
}
// Loads the character at position ptr into ch and advances ptr by one.
// When ptr is outside Input, ch is set to '\0' instead of reading out of
// bounds; ptr is still advanced so that a caller's "ptr--" rollback after a
// lookahead leaves the position where it was.
void GetCh()
{
    if (ptr >= 0 && static_cast<size_t>(ptr) < Input.size())
        ch = Input[ptr];
    else
        ch = '\0'; // end-of-input sentinel
    ptr++;
}
void Analyse() {
if (!outputfile.is_open())
{
string sErrmsg = "Open File FAIL";
throw sErrmsg;
}
char buf[MAXBUF]; //输入缓冲区,存放一个单词符号
int type=-1;//单词种类
int j = 0;
while (ptr < Input.size()) {
GetCh();
//跳过空格与\t
if (ch == ' ' || ch == '\t')
{
}
else if (ch == '\n')
{
}
/*********************字符串*************************/
else if (isLetter(ch)) {//禁止_开头
while (isLetter(ch) || isDigit(ch) || ch == '_') {
if ((ch <= 'Z') && (ch >= 'A'))
ch = ch + 32; /*忽略大小写*/
buf[j] = ch;
j++;
GetCh();
}
ptr--;
buf[j] = '\0';
j = 0;
type = isKeyword(buf);
if (type >0) { //是关键字
outputfile << "<" << buf << " , " << type << " >" << endl;
}
else//普通变量名
{
outputfile << "< " << buf << " , " << idtype << " >" << endl;
}
continue;
}
/*************************数字***************************/
else if (isDigit(ch)) {
int Dflag = 0;
while (isDigit(ch) ||isLetter(ch)||ch=='.') {
if (isLetter(ch)) {
buf[j] = ch;
j++;
GetCh();
Dflag = 2;
}
else if (isDigit(ch)) {
buf[j] = ch;
j++;
GetCh();
}
else if (ch == '.')
{
buf[j] = ch;
j++;
GetCh();
if (isDigit(ch))//.后必须跟数字
{
Dflag = 1;
}
else
{
Dflag = -1; continue;
}
}
}
ptr--;
buf[j] = '\0';
j = 0;
if (Dflag == -1)
{
outputfile << "ERROR: Not a legal decimal expression" << endl;
}
if (Dflag == 0)
{
type = consttype;//整型常数
outputfile << "< " << buf << " , " << type << " >" << endl;
}
if (Dflag == 1)
{
type = floattype;//小数常数
outputfile << "< " << buf << " , " << type << " >" << endl;
}
else if (Dflag == 2)
{
outputfile << "ERROR: Variable cannot start with number" << endl;
}//数字开头接字母,出错
continue;
}
/*************************分界符****************************/
else if (isBorder(ch)>-1) {
type = isBorder(ch);
if (type > -1)
{
outputfile << "< " << ch << " , " << type << " >" << endl;
}
continue;
}
//单独考虑特殊运算符和关系运算符
else switch (ch) {
case'+':
GetCh();
if (ch == '+') {
type = 42;
outputfile << "< " << "++" << " , " << type << " >" << endl;
break;
}
else if (ch == '=') {
type = 43;
outputfile << "< " << "+=" << " , " << type << " >" << endl;
break;
}
else
ptr--;//回退,普通+
type = 41;
outputfile << "< " << "+" << " , " << type << " >" << endl;
break;
case'-':
GetCh();
if (ch == '-') {
type = 45;
outputfile << "< " << "--" << " , " << type << " >" << endl;
break;
}
else if (ch == '=') {
type = 46;
outputfile << "< " << "-=" << " , " << type << " >" << endl;
break;
}
else
ptr--;//回退,普通-
type = 44;
outputfile << "< " << "-" << " , " << type << " >" << endl;
break;
case'*':outputfile << "< " << "*" << " , " << 47 << " >" << endl; break;
case'/':outputfile << "< " << "/" << " , " << 48 << " >" << endl; break;
case'<':
GetCh();
if (ch == '=') {
type = 49;
cout << "< " << "<=" << " , " << type << " >" << endl;
break;
}
else
ptr--;//回退,普通<
type = 50;
outputfile << "< " << "=" << " , " << type << " >" << endl;
break;
case'>':
GetCh();
if (ch == '=') {
type = 51;
outputfile << "< " << ">=" << " , " << type << " >" << endl;
break;
}
else
ptr--;//回退,普通>
type = 52;
outputfile << "< " << ">" << " , " << type << " >" << endl;
break;
case'=':
GetCh();
if (ch == '=') {
type = 53;
outputfile << "< " << "==" << " , " << type << " >" << endl;
break;
}
else
ptr--;//回退,普通-
type = 54;
outputfile << "< " << "=" << " , " << type << " >" << endl;
break;
case'!':
GetCh();
if (ch == '=') {
type = 55;
outputfile << "< " << "!=" << " , " << type << " >" << endl;
break;
}
break;
/***************非法字符*******************/
default:
outputfile << "ERROR: Unrecognized character:" << ch << endl;
}
}
}
// Program entry point: loads the input file, runs the lexical analysis and
// closes the output file.
// Returns 0 on success, or 1 when ReadFile_Char or Analyse reports a failure
// by throwing a std::string message (e.g. a file could not be opened).
int main() {
    try {
        std::cout << "Start Reading File……" << std::endl;
        ReadFile_Char();
        std::cout << "Start Analyse……" << std::endl;
        Analyse();
    }
    catch (const std::string& message) {
        // Without this handler the exception would end the program through
        // std::terminate without showing the message.
        std::cerr << message << std::endl;
        return 1;
    }
    std::cout << "finish!" << std::endl;
    outputfile.close();
    return 0;
}
三、测试样例与结果分析
测试样例含非法变量名“1z”“_dec”、非法数字及无法识别的字符“#”“@”
观察输出结果中发现对关键字、长变量名均有良好的识别效果,并进行了错误反馈。
- 识别出了非法变量名_dec
- 识别出了非法变量名1z
- 识别出了非法数字10.
- 识别出了#@等未定义的字符