针对简易版c语言的词法分析:
详细思路可直接看代码内注释。
【文法定义】:
<标识符>::=<字母>{<字母>|<数字>} //标识符和关键字都不区分大小写,比如if和IF均为关键字,不允许出现与关键字相同的标识符
<字母>::=_|a|...|z|A|...|Z
<数字>::=0|1|...|9
<整数>::=[+|-]<无符号整数>
<无符号整数>::=<数字>{<数字>}
<字符>::='<加法运算符>'|'<乘法运算符>'|'<字母>'|'<数字>'
<加法运算符>::=+|-
<乘法运算符>::=*|/
<字符串>::="{十进制编码为32,33,35-126的ASCII字符}" //字符串中要求至少有一个字符
【问题描述】
请根据给定的文法设计并实现词法分析程序,从源程序中识别出单词,记录其单词类别和单词值,输入输出及处理要求如下:
(1)数据结构和与语法分析程序的接口请自行定义;类别码需按下表格式统一定义;
(2)为了方便进行自动评测,输入的被编译源文件统一命名为testfile.txt(注意不要写错文件名);输出的结果文件统一命名为output.txt(注意不要写错文件名),结果文件中每行按如下方式组织:
单词类别码 单词的字符/字符串形式(中间仅用一个空格间隔)
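单词的类别码请统一按如下形式定义(原文此处的表格缺失,以下按代码中实际使用的类别码列出):
标识符 IDENFR,整数 INTCON,字符 CHARCON,字符串 STRCON;
const CONSTTK,int INTTK,char CHARTK,void VOIDTK,main MAINTK,if IFTK,else ELSETK,do DOTK,while WHILETK,for FORTK,scanf SCANFTK,printf PRINTFTK,return RETURNTK;
+ PLUS,- MINU,* MULT,/ DIV,< LSS,<= LEQ,> GRE,>= GEQ,== EQL,!= NEQ,= ASSIGN;
; SEMICN,, COMMA,( LPARENT,) RPARENT,[ LBRACK,] RBRACK,{ LBRACE,} RBRACE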
【输入形式】testfile.txt中的符合文法要求的测试程序。
【输出形式】要求将词法分析结果输出至output.txt中。
#include<iostream>
#include<stdio.h>
#include<cstring>
#include<map>
#include <fstream>
using namespace std;
// Capacity of the token/value arrays below.
const int MAXN=1000;
// Token sequence produced by the lexer: token[i] holds a category code
// such as "IDENFR" or "PLUS".
string token[MAXN];
// Text of each token (the identifier, number, operator, or the contents of a
// character/string literal), filled in step with token[].
string val[MAXN];
// Next free slot in token[]; also the number of tokens stored so far.
int p = 0;
// Next free slot in val[]; the lexer advances it together with p.
int cnmd =0;
// Reports whether A is a decimal digit ('0' through '9').
bool NUMBER(char A)
{
    return '0' <= A && A <= '9';
}
// Reports whether A counts as a letter for this grammar: an underscore or an
// ASCII letter of either case.
bool LETTER(char A)
{
    if (A == '_')
    {
        return true;
    }
    const bool isUpper = ('A' <= A && A <= 'Z');
    const bool isLower = ('a' <= A && A <= 'z');
    return isUpper || isLower;
}
//看是否是保留字或标识符
bool RESERVEDWORD(string s)
{
if(s=="const")
{
token[p++] = "CONSTTK";
val[cnmd++] = s;
//fprintf(fp,"%s %s\n","CONSTTK",a);
return true;
}
if(s=="int")
{
token[p++] = "INTTK";
val[cnmd++] = s;
//fprintf(fp,"%s %s\n","INTTK",a);
return true;
}
if(s=="char")
{
token[p++] = "CHARTK";
val[cnmd++] = s;
//fprintf(fp,"%s %s\n","CHARTK",a);
return true;
}
if(s=="void")
{
token[p++] = "VOIDTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","VOIDTK",a);
return true;
}
if(s=="main")
{
token[p++] = "MAINTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","MAINTK",a);
return true;
}
if(s=="if")
{
token[p++] = "IFTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","IFTK",a);
return true;
}
if(s=="else")
{
token[p++] = "ELSETK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","ELSETK",a);
return true;
}
if(s=="do")
{
token[p++] = "DOTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","DOTK",a);
return true;
}
if(s=="while")
{
token[p++] = "WHILETK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","WHILETK",a);
return true;
}
if(s=="for")
{
token[p++] = "FORTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","FORTK",a);
return true;
}
if(s=="scanf")
{
token[p++] = "SCANFTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","SCANFTK",a);
return true;
}
if(s=="printf")
{
token[p++] = "PRINTFTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","PRINTFTK",a);
return true;
}
if(s=="return")
{
token[p++] = "RETURNTK";
val[cnmd++] = s;
// fprintf(fp,"%s %s\n","RETURNTK",a);
return true;
}
return false;
}
// Appends one (category code, text) pair to the global token sequence.
static void cifa_emit(const char *code, const std::string &text)
{
    token[p++] = code;
    val[cnmd++] = text;
}

// Handles an operator that may be extended by a following '='.
// Emits (eqCode, eqText) when the next character is '=', otherwise pushes the
// lookahead character back onto the stream and emits (code, text).
static void cifa_emit_with_eq(FILE *f, const char *code, const char *text,
                              const char *eqCode, const char *eqText)
{
    const int next = fgetc(f);
    if (next == '=')
    {
        cifa_emit(eqCode, eqText);
        return;
    }
    if (next != EOF)
    {
        ungetc(next, f); // not part of this operator: leave it for the main loop
    }
    cifa_emit(code, text);
}

// Reads the body of a quoted literal whose opening quote was already consumed.
// Consumes the closing quote; stops early at end of file. The quotes are not
// part of the returned text.
static std::string cifa_read_quoted(FILE *f, char quote)
{
    std::string body;
    int next;
    while ((next = fgetc(f)) != EOF && next != quote)
    {
        body += static_cast<char>(next);
    }
    return body;
}

// Lexical analysis: reads testfile.txt and fills the global token[]/val[]
// arrays with identifiers, integers, character and string literals, reserved
// words, operators and delimiters, in source order.
//
// Whitespace and characters that start no token are skipped. If the input file
// cannot be opened, an error is reported on stderr and no tokens are produced.
// Scanning stops with a message on stderr once the token arrays are full.
//
// This function only fills token[]/val[]; it does not write output.txt.
void cifa()
{
    FILE *f = fopen("testfile.txt", "r");
    if (f == NULL)
    {
        perror("testfile.txt");
        return;
    }

    // Number of slots in token[] (val[] is declared with the same size).
    const int capacity = static_cast<int>(sizeof(token) / sizeof(token[0]));

    // fgetc returns int so that EOF can be told apart from every valid byte.
    int ch;
    while ((ch = fgetc(f)) != EOF)
    {
        // Every iteration stores at most one token, so one check is enough.
        if (p >= capacity || cnmd >= capacity)
        {
            fprintf(stderr, "cifa: token buffer full, remaining input ignored\n");
            break;
        }

        const char c = static_cast<char>(ch);

        // Starts with a letter or '_': reserved word or identifier.
        if (LETTER(c))
        {
            std::string word(1, c);
            while ((ch = fgetc(f)) != EOF &&
                   (LETTER(static_cast<char>(ch)) || NUMBER(static_cast<char>(ch))))
            {
                word += static_cast<char>(ch);
            }
            if (ch != EOF)
            {
                ungetc(ch, f); // first character after the word starts the next token
            }
            if (!RESERVEDWORD(word))
            {
                cifa_emit("IDENFR", word);
            }
            continue;
        }

        // Starts with a digit: unsigned integer.
        if (NUMBER(c))
        {
            std::string digits(1, c);
            while ((ch = fgetc(f)) != EOF && NUMBER(static_cast<char>(ch)))
            {
                digits += static_cast<char>(ch);
            }
            if (ch != EOF)
            {
                ungetc(ch, f);
            }
            cifa_emit("INTCON", digits);
            continue;
        }

        switch (c)
        {
        // Arithmetic operators.
        case '+': cifa_emit("PLUS", "+"); break;
        case '-': cifa_emit("MINU", "-"); break;
        case '*': cifa_emit("MULT", "*"); break;
        case '/': cifa_emit("DIV", "/"); break;

        // Assignment and relational operators (one or two characters).
        case '=': cifa_emit_with_eq(f, "ASSIGN", "=", "EQL", "=="); break;
        case '<': cifa_emit_with_eq(f, "LSS", "<", "LEQ", "<="); break;
        case '>': cifa_emit_with_eq(f, "GRE", ">", "GEQ", ">="); break;
        case '!':
        {
            // Only "!=" is a token; a lone '!' is skipped.
            const int next = fgetc(f);
            if (next == '=')
            {
                cifa_emit("NEQ", "!=");
            }
            else if (next != EOF)
            {
                ungetc(next, f);
            }
            break;
        }

        // Delimiters.
        case ';': cifa_emit("SEMICN", ";"); break;
        case ',': cifa_emit("COMMA", ","); break;
        case '(': cifa_emit("LPARENT", "("); break;
        case ')': cifa_emit("RPARENT", ")"); break;
        case '[': cifa_emit("LBRACK", "["); break;
        case ']': cifa_emit("RBRACK", "]"); break;
        case '{': cifa_emit("LBRACE", "{"); break;
        case '}': cifa_emit("RBRACE", "}"); break;

        // Character and string literals: the value excludes the quotes.
        case '\'': cifa_emit("CHARCON", cifa_read_quoted(f, '\'')); break;
        case '"': cifa_emit("STRCON", cifa_read_quoted(f, '"')); break;

        default:
            // Whitespace or a character that starts no token: skip it.
            break;
        }
    }

    fclose(f);
}
// Program entry point: runs the lexer, which fills the global token sequence.
int main()
{
    cifa();
    // yufafenxi();  // later stage, not called yet
    return 0;
}
ps:上面的程序已经留好了token序列的接口,后续的语法分析可根据该程序继续开发。