C++设计简单的词法分析器
概述:
输入源代码,通过编写的程序识别出保留字、单字符运算符(如 +)、双字符运算符(如 ++)、常数、标识符和界符,并以二元式的形式输出。其中常数和标识符要分别插入常数表和标识符表,其二元式的值为表中对应的id值
注:长期未使用C/C++,用起来有点生疏,所以掺有部分C函数实现的功能,希望大家能够优化
环境
Codeblocks,win10,c++
设计思想(流程)
1.一符一码,将运算符、界符和保留字编码(1-98范围内自定义添加),将常数编码为99,标识符编码为100
2.文件内容的读入和预处理:将编写好的文本内容读到数组中,对数组进行解析处理,输出二元式
3.定义全局变量如下:
①保留字数组reserveWord
②单字节运算符数组singleOperator
③双字节运算符数组mutilOperator
④Syn判断符号种别
⑤Token保存当前正在判别的单词内容
⑥projectResource保存文件字符数组
⑦pProject指向正在判别的projectResource内的字符
⑧常数表 ConstTbl保存常数
⑨标识符表CharTbl保存标识符
4.编写七个函数,分别实现如下功能
①判断当前字符是否为数字isDigit
②判断当前字符是否为字母isLetter
③判断当前内容是否为保留字isReserve
④判断当前字符是否为运算符isOperater
⑤指针回退Retract
⑥文件内容预处理(清除注释、换行)filterResource
⑦文件内容扫描,实现单词分离Scanner
5.函数调用
(1)从字符串projectResource的开头开始扫描,分别判断该字符是否为字母、数字、运算符等
(2)当该字符为字母时,将该字母存入token,并且继续判断下一个字符是否为字母或数字,此轮判断结束时,判断是否为保留字,并赋予相应的syn种别码
(3)该字符为运算符时,进一步判断是否为双字符运算符
(4)当该字符为数字时,进一步判断下个字符是否为数字
(5)结束符($)判断一次后即可返回
思路很简单,关键研究代码
#include<stdio.h>
#include <string.h>
#include <fstream> //文件流库函数
#include <iostream>
using namespace std;
// Reserved-word table. Scanner reports a word found at index i with
// category code i+1, so the codes run from 1 to 32.
char reserveWord[32][20]={ // reserved words, codes 1-32
"stdio", "break", "case", "char", "const", "continue",
"default", "do", "double", "else", "printf", "extern",
"float", "for", "goto", "if", "int", "long",
"include", "return", "short", "signed", "sizeof", "static",
"struct", "switch", "typedef", "main", "unsigned", "void",
"volatile", "while"
};
// Single-character operators and delimiters. Scanner reports the entry at
// index i with category code 33+i, so the 24 entries use codes 33 to 56.
char singleOperator[24][10]={ // single-character operators, codes 33-56
"+", "-", "*", "/", "<", ">",
";", "(", ")", "^", ",", "#",
"|", "%", "~", "[", "]", "{",
"}", ".", ":", "!", "=", "&"
};
// Two-character operators. Scanner reports the entry at index i with
// category code 57+i, so the 14 entries use codes 57 to 70.
// NOTE(review): the last four entries (backslash, question mark, double
// quote, single quote) are one character long, while Scanner looks this
// table up with a two-character string -- confirm whether they belong here.
static char mutilOperator[14][10]={ // two-character operators, codes 57-70
"++","--","==","<=",">=","<<",
">>","&&","||","!=","\\","\?",
"\"", "\'"
};
char CharTbl[20][10]={"\0"}; // identifier table: 20 rows of 10 bytes each
char ConstTbl[20][10]={"\0"}; // constant table: 20 rows of 10 bytes each
int pChar=0; // index of the next free row in CharTbl (number of stored identifiers)
int pConst=0; // index of the next free row in ConstTbl (number of stored constants)
// Append an identifier to the identifier table CharTbl.
//
// The text is copied into row pChar and pChar is then advanced. Text that
// does not fit in a row is truncated so the copy can never run past the
// row, and when every row is already used the insert is rejected with a
// message on stderr instead of writing beyond the table.
//
// token: NUL-terminated identifier text; a NULL pointer is ignored.
void insertChar(char token[]){
    const int rows = (int)(sizeof(CharTbl) / sizeof(CharTbl[0]));

    if (token == NULL) {
        return;
    }
    if (pChar >= rows) {
        fprintf(stderr, "identifier table is full, dropped: %s\n", token);
        return;
    }
    // snprintf always NUL-terminates and never writes more than the row size.
    snprintf(CharTbl[pChar], sizeof(CharTbl[pChar]), "%s", token);
    pChar++;
}
// Append a numeric constant (as text) to the constant table ConstTbl.
//
// The text is copied into row pConst and pConst is then advanced. Text that
// does not fit in a row is truncated so the copy can never run past the
// row, and when every row is already used the insert is rejected with a
// message on stderr instead of writing beyond the table.
//
// token: NUL-terminated constant text; a NULL pointer is ignored.
void insertConst(char token[]){
    const int rows = (int)(sizeof(ConstTbl) / sizeof(ConstTbl[0]));

    if (token == NULL) {
        return;
    }
    if (pConst >= rows) {
        fprintf(stderr, "constant table is full, dropped: %s\n", token);
        return;
    }
    // snprintf always NUL-terminates and never writes more than the row size.
    snprintf(ConstTbl[pConst], sizeof(ConstTbl[pConst]), "%s", token);
    pConst++;
}
// Read the whole of ./test.txt into a freshly allocated C string.
//
// tmp:     its incoming value is not used; the parameter is kept so existing
//          callers keep compiling. It is only used as the local result pointer.
// returns: a NUL-terminated buffer allocated with new[]; the caller owns it
//          and must release it with delete[]. If the file cannot be opened
//          or measured, the reason is reported with perror and an empty
//          string (still heap-allocated) is returned, so callers can treat
//          the result as a valid C string in every case.
char* readFile(char* tmp){
    long file_size = 0; // size reported by ftell, clamped to >= 0
    size_t got = 0;     // characters actually delivered by fread

    FILE *fp = fopen("./test.txt","r");
    if (fp == NULL) {
        perror("./test.txt");
    } else {
        if (fseek(fp,0,SEEK_END) == 0) {
            file_size = ftell(fp);
        }
        if (file_size < 0) {
            perror("./test.txt");
            file_size = 0;
        }
        printf("文件字符数为 %d\n",(int)file_size);
    }

    // One extra byte so the text can always be NUL-terminated.
    tmp = new char[file_size + 1];

    if (fp != NULL) {
        if (fseek(fp,0,SEEK_SET) == 0) {
            got = fread(tmp,sizeof(char),(size_t)file_size,fp);
        }
        fclose(fp);
    }
    // In text mode fread may deliver fewer characters than ftell reported
    // (line-ending translation), so terminate at the count actually read.
    tmp[got] = '\0';
    return tmp;
}
// Helper for filterResource: add one blank to the output unless the output
// is still empty or already ends with a blank.
static void appendSeparator(char *out,int &count){
    if (count > 0 && out[count - 1] != ' ') {
        out[count++] = ' ';
    }
}

// Pre-process source text in place: drop comments and layout characters.
//
// - "// ..." comments are removed up to (not including) the line break.
// - "/* ... */" comments are removed. If the closing "*/" is missing, or the
//   end marker '$' appears before it, an error message is printed and
//   filtering resumes at the '$' (if there is one) so the end marker still
//   reaches the scanner.
// - Newline, tab, vertical tab and carriage return are removed.
// - Wherever a comment or layout character is removed, a single blank is
//   left behind (never two in a row), so the words on either side do not
//   fuse into one token.
// - Comment markers inside string literals are not recognised as such and
//   are stripped like any other comment.
//
// r:        NUL-terminated text to clean; overwritten with the result, which
//           is never longer than the input.
// pProject: number of characters of r to process.
// returns:  r.
char* filterResource(char *r,int pProject){
    if (r == NULL || pProject <= 0) {
        return r;
    }

    char *clean = new char[pProject + 1]; // +1 for the terminator
    int count = 0;
    int i = 0;

    while (i < pProject) {
        const char c = r[i];
        const char next = (i + 1 < pProject) ? r[i + 1] : '\0';

        if (c == '/' && next == '/') {
            // Line comment: stop on the line break (handled below as layout)
            // or at the end of the input.
            i += 2;
            while (i < pProject && r[i] != '\n') {
                i++;
            }
            continue;
        }

        if (c == '/' && next == '*') {
            bool closed = false;
            i += 2;
            while (i < pProject && r[i] != '$') {
                if (r[i] == '*' && i + 1 < pProject && r[i + 1] == '/') {
                    closed = true;
                    i += 2; // step over the closing marker
                    break;
                }
                i++;
            }
            if (!closed) {
                printf("注释出错,没有找到 */,程序结束!!!\n");
            }
            appendSeparator(clean, count);
            continue;
        }

        if (c == '\n' || c == '\t' || c == '\v' || c == '\r') {
            appendSeparator(clean, count);
            i++;
            continue;
        }

        clean[count++] = c;
        i++;
    }

    clean[count] = '\0';
    memcpy(r, clean, (size_t)count + 1);
    delete[] clean;
    return r;
}
// Tell whether ch is an ASCII letter.
// Returns 1 for 'a'-'z' and 'A'-'Z', 0 for anything else.
int isLetter(char ch){
    const int lower = (ch >= 'a' && ch <= 'z');
    const int upper = (ch >= 'A' && ch <= 'Z');
    return (lower || upper) ? 1 : 0;
}
// Tell whether ch is a decimal digit.
// Returns 1 for '0'-'9', 0 for anything else.
int isDigit(char ch){
    return (ch < '0' || ch > '9') ? 0 : 1;
}
// Look ch up among the single-character operators.
// Returns the index of the first singleOperator entry whose first character
// equals ch (the code relative to the start of the operator range), or -1
// when no entry matches.
int isOperater(char ch){
    const int total = (int)(sizeof(singleOperator) / sizeof(singleOperator[0]));
    int pos = 0;
    while (pos < total) {
        if (singleOperator[pos][0] == ch) {
            return pos;
        }
        ++pos;
    }
    return -1;
}
// Look token up in the reserved-word table.
// Returns the index of the matching reserveWord entry (the code relative to
// the start of the reserved-word range), or 100 when token is not a
// reserved word.
int isReserve(char token[]){
    const int total = (int)(sizeof(reserveWord) / sizeof(reserveWord[0]));
    int pos = 0;
    while (pos < total) {
        if (strcmp(token, reserveWord[pos]) == 0) {
            return pos;
        }
        ++pos;
    }
    return 100;
}
// Step the read position back by one character.
void Retract(int &pProject){
    pProject = pProject - 1;
}
// Scan one word starting at projectResorce[pProject].
//
// Leading blanks are skipped, the first 10 bytes of token are cleared, and
// the word is classified:
//   - letter followed by letters/digits: syn is the reserved-word code
//     (table index + 1) or 100 for an identifier; token holds the text.
//   - run of digits: syn is 99; token holds the text.
//   - operator: syn is 57 + index for a two-character operator found in
//     mutilOperator, otherwise 33 + index into singleOperator; token holds
//     the operator text.
//   - '$': syn is 0 (end of input).
//   - anything else: syn is -1.
//
// syn:            receives the category code described above.
// pProject:       read position; on return it is just past the scanned word.
// token:          output buffer of at least 10 bytes. Words longer than 9
//                 characters are consumed in full but stored truncated to
//                 9 characters, so the buffer is never overrun.
// projectResorce: NUL-terminated text to scan.
void Scanner(int& syn,int &pProject,char token[],char projectResorce[]){
    const int tokenCapacity = 10; // size of the buffer the callers provide
    const int pairCount = (int)(sizeof(mutilOperator) / sizeof(mutilOperator[0]));
    char pair[3] = {'\0', '\0', '\0'}; // two characters plus the terminator strcmp needs
    int ptoken = 0;

    syn = 0;
    char ch = projectResorce[pProject++];
    while (ch == ' ') { // skip blanks in front of the word
        ch = projectResorce[pProject++];
    }
    memset(token, '\0', tokenCapacity); // start from an empty token

    if (isLetter(ch)) { // identifier or reserved word
        while (isLetter(ch) || isDigit(ch)) {
            if (ptoken < tokenCapacity - 1) { // keep room for the terminator
                token[ptoken++] = ch;
            }
            ch = projectResorce[pProject++];
        }
        Retract(pProject); // give back the character that ended the word
        syn = isReserve(token);
        if (syn != 100) { // reserved word: turn the index into a 1-based code
            syn += 1;
        }
        return;
    }

    if (isDigit(ch)) { // numeric constant
        while (isDigit(ch)) {
            if (ptoken < tokenCapacity - 1) {
                token[ptoken++] = ch;
            }
            ch = projectResorce[pProject++];
        }
        syn = 99;
        Retract(pProject);
        return;
    }

    const int index = isOperater(ch);
    if (index != -1) {
        token[ptoken++] = ch;
        pair[0] = ch;
        pair[1] = projectResorce[pProject++]; // look one character ahead
        for (int i = 0; i < pairCount; i++) {
            if (strcmp(pair, mutilOperator[i]) == 0) {
                syn = 57 + i;
                token[ptoken++] = pair[1]; // second character of the operator
                return;
            }
        }
        Retract(pProject); // the look-ahead character belongs to the next word
        syn = 33 + index;
        return;
    }

    if (ch == '$') {
        syn = 0;
    } else {
        syn = -1;
    }
    return;
}
// Helper for printTbl: print a title, a header row, then one
// "index | text" line for each of the first `used` rows.
static void printOneTable(const char *title,char rows[][10],int used){
    cout << title << endl;
    cout << "id | " << "value" << endl;
    int row = 0;
    while (row < used) {
        cout << row << " | " << rows[row] << endl;
        ++row;
    }
}

// Print the identifier table followed by the constant table.
void printTbl(){
    printOneTable("CharTbl", CharTbl, pChar);
    printOneTable("ConstTbl", ConstTbl, pConst);
}
// Entry point: load the source text, strip comments and layout characters,
// then scan it word by word and print one pair per word. Constants and
// identifiers are stored in their tables and reported by table index.
// Scanning stops when Scanner reports 0 (end marker) or -1 (unrecognised
// character). The two tables are printed at the end.
int main(){
    int pProject=0,syn=-1;
    char token[10];
    char *projectResorce = NULL; // never hand an indeterminate pointer to readFile

    projectResorce = readFile(projectResorce); // load the file contents
    if (projectResorce == NULL) {
        fprintf(stderr, "failed to load source text\n");
        return 1;
    }
    cout<<"source code:\n"<<projectResorce<<endl;
    projectResorce = filterResource(projectResorce,(int)strlen(projectResorce)); // pre-process the text
    cout<<"干净的字符串:\n "<<projectResorce<<endl;

    pProject=0; // scan from the beginning
    while(syn!=0){ // print one pair per word
        Scanner(syn,pProject,token,projectResorce);
        if(syn==99){
            insertConst(token);
            cout<<"<常数,"<<pConst-1<<">"<<endl;
        }else if(syn==100){
            insertChar(token);
            cout<<"<标识符,"<<pChar-1<<">"<<endl;
        }else if(syn>=1&&syn<=71){
            cout<<"<"<<syn<<",-->" <<endl;
        }else if(syn==-1){
            cout<<"error code!\n"<<endl;
            break;
        }
        else{ cout<<"compile finished!\n"<<endl; }
    }

    printTbl(); // print the identifier and constant tables
    delete[] projectResorce; // the buffer came from new[], so release it with delete[]
    return 0;
}
运行截图如下: