一、利用Antlr生成的词法分析器
1. 首先下载 GitHub 项目 jiweixing/BIT-MiniCC(A C compiler framework in Java),在 IDEA 中打开,并确保已安装好 JDK 1.8。
2. 请参考 CSDN 博客《编译原理Antlr教程》(寒士°、的博客)完成 Antlr 的安装与配置,并打包好 MyCGrammer.jar。
3.将MyCGrammer.jar放入lib文件夹中
之后点击右上角 File -> Project Structure -> Libraries,在其中引入 MyCGrammer.jar。
4. 在 BIT-MiniCC-master\src\bit\minisys\minicc 目录下新建文件 MyMiniCompiler.java,
在 BIT-MiniCC-master\src\bit\minisys\minicc\scanner 目录下新建文件 MyScanner.java,两个文件的代码依次如下:
package bit.minisys.minicc;
import MyCGrammer.MyCGrammerLexer;
import MyCGrammer.MyCGrammerParser;
import bit.minisys.minicc.scanner.MyScanner;
import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.tree.ParseTree;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Command-line driver: lexes and parses one C source file with the ANTLR-generated
 * {@code MyCGrammerLexer}/{@code MyCGrammerParser} and hands the buffered token stream
 * to {@link MyScanner}, which writes it to {@code <source file name>.tokens} in the
 * current working directory.
 */
public class MyMiniCompiler {
    /**
     * Runs the lexer/parser on the input file and dumps its tokens.
     *
     * @param args optional; {@code args[0]} is the path of the C source file. When absent,
     *             the built-in placeholder path is used (edit it to point at a real file).
     * @throws IOException if the input file cannot be read or the token file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputFile = args.length > 0 ? args[0] : "输入文件路径";

        CommonTokenStream tokens;
        // ANTLRInputStream reads the whole stream in its constructor, so the file can be
        // closed as soon as parsing is done; try-with-resources also closes it on failure.
        try (InputStream is = new FileInputStream(inputFile)) {
            ANTLRInputStream input = new ANTLRInputStream(is);
            MyCGrammerLexer lexer = new MyCGrammerLexer(input);
            tokens = new CommonTokenStream(lexer);
            MyCGrammerParser parser = new MyCGrammerParser(tokens);
            // Parsing pulls every token through the stream, which fills its buffer.
            parser.compilationUnit();
        }

        // Keep only the last path component; accept both '\' and '/' as separators.
        String[] pathParts = inputFile.trim().split("[\\\\/]");
        String tokenFileName = pathParts[pathParts.length - 1] + ".tokens";
        new MyScanner(tokenFileName, tokens);
    }
}
package bit.minisys.minicc.scanner;
import org.antlr.v4.runtime.CommonTokenStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
/**
 * Writes the tokens of an ANTLR token stream to a text file, one token per line,
 * using each token's {@code toString()} form.
 */
public class MyScanner {
    /**
     * Dumps every default-channel token of {@code tokens} to {@code tokenFileName}.
     *
     * @param tokenFileName path of the file to create (an existing file is overwritten)
     * @param tokens        token stream to dump; it is filled up to EOF if not already
     * @throws IOException if the file cannot be created or written
     */
    public MyScanner(String tokenFileName, CommonTokenStream tokens) throws IOException {
        // Make sure the whole input has been tokenized before walking the buffer.
        tokens.fill();
        // try-with-resources closes the writer even when a write fails.
        try (FileWriter fileWriter = new FileWriter(new File(tokenFileName))) {
            // Walk ALL buffered tokens and filter by channel. Bounding the index by the
            // number of on-channel tokens would emit hidden-channel tokens and cut off
            // the tail whenever the grammar routes anything to another channel.
            for (int i = 0; i < tokens.size(); i++) {
                if (tokens.get(i).getChannel() != 0) { // 0 is ANTLR's default channel
                    continue;
                }
                fileWriter.write(tokens.get(i).toString());
                fileWriter.write("\n");
            }
        }
    }
}
5. 结果展示(下面先给出输入的 C 源文件,随后是生成的 token 序列)
int fibonacci(int num){
int res;
if(num < 1){
res = 0;
}else if(num <= 2){
res = 1;
}else{
res = fibonacci(num-1)+fibonacci(num-2);
}
return res;
}
int main(){
Mars_PrintStr("Please input a number:\n");
int n = Mars_GetInt();
int res = fibonacci(n);
Mars_PrintStr("This number's fibonacci value is :\n");
Mars_PrintInt(res);
return 0;
}
[@0,0:2='int',<28>,1:0]
[@1,4:12='fibonacci',<62>,1:4]
[@2,13:13='(',<8>,1:13]
[@3,14:16='int',<28>,1:14]
[@4,18:20='num',<62>,1:18]
[@5,21:21=')',<9>,1:21]
[@6,22:22='{',<5>,1:22]
[@7,25:27='int',<28>,2:1]
[@8,29:31='res',<62>,2:5]
[@9,32:32=';',<1>,2:8]
[@10,35:36='if',<7>,3:1]
[@11,37:37='(',<8>,3:3]
[@12,38:40='num',<62>,3:4]
[@13,42:42='<',<47>,3:8]
[@14,44:44='1',<63>,3:10]
[@15,45:45=')',<9>,3:11]
[@16,46:46='{',<5>,3:12]
[@17,50:52='res',<62>,4:2]
[@18,54:54='=',<70>,4:6]
[@19,56:56='0',<63>,4:8]
[@20,57:57=';',<1>,4:9]
[@21,60:60='}',<6>,5:1]
[@22,61:64='else',<10>,5:2]
[@23,66:67='if',<7>,5:7]
[@24,68:68='(',<8>,5:9]
[@25,69:71='num',<62>,5:10]
[@26,73:74='<=',<45>,5:14]
[@27,76:76='2',<63>,5:17]
[@28,77:77=')',<9>,5:18]
[@29,78:78='{',<5>,5:19]
[@30,82:84='res',<62>,6:2]
[@31,86:86='=',<70>,6:6]
[@32,88:88='1',<63>,6:8]
[@33,89:89=';',<1>,6:9]
[@34,92:92='}',<6>,7:1]
[@35,93:96='else',<10>,7:2]
[@36,97:97='{',<5>,7:6]
[@37,101:103='res',<62>,8:2]
[@38,105:105='=',<70>,8:6]
[@39,107:115='fibonacci',<62>,8:8]
[@40,116:116='(',<8>,8:17]
[@41,117:119='num',<62>,8:18]
[@42,120:120='-',<57>,8:21]
[@43,121:121='1',<63>,8:22]
[@44,122:122=')',<9>,8:23]
[@45,123:123='+',<56>,8:24]
[@46,124:132='fibonacci',<62>,8:25]
[@47,133:133='(',<8>,8:34]
[@48,134:136='num',<62>,8:35]
[@49,137:137='-',<57>,8:38]
[@50,138:138='2',<63>,8:39]
[@51,139:139=')',<9>,8:40]
[@52,140:140=';',<1>,8:41]
[@53,143:143='}',<6>,9:1]
[@54,146:151='return',<18>,10:1]
[@55,153:155='res',<62>,10:8]
[@56,156:156=';',<1>,10:11]
[@57,158:158='}',<6>,11:0]
[@58,160:162='int',<28>,12:0]
[@59,164:167='main',<62>,12:4]
[@60,168:168='(',<8>,12:8]
[@61,169:169=')',<9>,12:9]
[@62,170:170='{',<5>,12:10]
[@63,173:185='Mars_PrintStr',<62>,13:1]
[@64,186:186='(',<8>,13:14]
[@65,187:212='"Please input a number:\n"',<64>,13:15]
[@66,213:213=')',<9>,13:41]
[@67,214:214=';',<1>,13:42]
[@68,217:219='int',<28>,14:1]
[@69,221:221='n',<62>,14:5]
[@70,223:223='=',<70>,14:7]
[@71,225:235='Mars_GetInt',<62>,14:9]
[@72,236:236='(',<8>,14:20]
[@73,237:237=')',<9>,14:21]
[@74,238:238=';',<1>,14:22]
[@75,241:243='int',<28>,15:1]
[@76,245:247='res',<62>,15:5]
[@77,249:249='=',<70>,15:9]
[@78,251:259='fibonacci',<62>,15:11]
[@79,260:260='(',<8>,15:20]
[@80,261:261='n',<62>,15:21]
[@81,262:262=')',<9>,15:22]
[@82,263:263=';',<1>,15:23]
[@83,266:278='Mars_PrintStr',<62>,16:1]
[@84,279:279='(',<8>,16:14]
[@85,280:317='"This number's fibonacci value is :\n"',<64>,16:15]
[@86,318:318=')',<9>,16:53]
[@87,319:319=';',<1>,16:54]
[@88,322:334='Mars_PrintInt',<62>,17:1]
[@89,335:335='(',<8>,17:14]
[@90,336:338='res',<62>,17:15]
[@91,339:339=')',<9>,17:18]
[@92,340:340=';',<1>,17:19]
[@93,346:351='return',<18>,18:4]
[@94,353:353='0',<63>,18:11]
[@95,354:354=';',<1>,18:12]
[@96,356:356='}',<6>,19:0]
[@97,357:356='<EOF>',<-1>,19:1]
二、C++手撸词法分析器
当时做实验的时候还没有做到后面,自己手撸了一个词法分析器,然而后面的实验根本用不上,所以不如一开始就使用antlr来做,并不推荐此方法。
词法分析器本质是一个DFA,罗列出状态,根据转换条件编写转换函数即可。
#include<iostream>
#include<fstream>
#include<string>
#include<vector>
#include<sstream>
using namespace std;
// Reserved words recognised by isKeyword(); an identifier matching one of
// these entries is reported as a keyword token.
string keywords[34] = {
    "auto", "break", "case", "char", "const", "continue", "default", "do",
    "double", "else", "enum", "extern", "float", "for", "goto", "if",
    "inline", "int", "long", "register", "restrict", "return", "short", "signed",
    "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void",
    "volatile", "while"
};

// Operators and punctuators recognised by isSymbol(), including the
// digraph spellings at the end of the table.
string symbol[54] = {
    "[", "]", "(", ")", "{", "}", ".", "->", "++", "--",
    "&", "*", "+", "-", "~", "!", "/", "%", "<<", ">>",
    "<", ">", "<=", ">=", "==", "!=", "^", "|", "&&", "||",
    "?", ":", ";", "...", "=", "*=", "/=", "%=", "+=", "-=",
    "<<=", ">>=", "&=", "^=", "|=", ",", "#", "##", "<:", ":>",
    "<%", "%>", "%:", "%:%:"
};
// States of the hand-written lexer's state machine (see the scanning loop in main).
enum DFA
{
    INITIAL    = 0, // start state: no token in progress
    IDENTIFIER = 1, // reading an identifier (or keyword)
    KEYWORD    = 2, // keyword (the scanning loop never switches to this state)
    SYMBOL     = 3, // reading an operator / punctuator
    CHAR       = 4, // inside a character constant
    MIDCHAR    = 5, // after a leading u/U/l/L that may prefix a char or string literal
    INTERGER   = 6, // reading an integer constant
    STRING     = 7, // inside a string literal
    MIDSTRING  = 8, // after an '8' seen in MIDCHAR, which may be followed by a string literal
    FLOAT      = 9, // reading a floating constant (after the '.')
};
// Shared scanner state.
vector<string> File; // source text, one element per line (no line terminators)
string line;         // copy of the line GetNextChar() is currently reading
int lineIndex = 0;   // index into File of the current line
int charIndex = 0;   // index within the current line of the next character to return
int tokenNum = 0;    // running token counter

// Returns the next character of File and advances the read position.
// After the last character of each line a single '\n' is returned; once every
// line has been consumed the function keeps returning '\0'.
char GetNextChar()
{
    while (static_cast<size_t>(lineIndex) < File.size())
    {
        line = File.at(lineIndex);
        const size_t column = static_cast<size_t>(charIndex);

        if (column < line.length())
        {
            // Ordinary character inside the line.
            charIndex++;
            return line.at(column);
        }
        if (column == line.length())
        {
            // Just past the last character: report the line break once.
            charIndex++;
            return '\n';
        }

        // The line break was already reported; move to the next line.
        lineIndex++;
        charIndex = 0;
    }
    return '\0';
}
// Returns true when s is exactly one of the entries of the keywords table.
bool isKeyword(string s)
{
    for (const string &candidate : keywords)
    {
        if (candidate == s)
        {
            return true;
        }
    }
    return false;
}
// Returns true when s is exactly one of the entries of the symbol table.
bool isSymbol(string s)
{
    for (const string &candidate : symbol)
    {
        if (candidate == s)
        {
            return true;
        }
    }
    return false;
}
// Overwrites text[first, last) with spaces. '\n' characters are left in place so
// that the line numbers of the remaining code do not shift.
static void BlankRange(string &text, size_t first, size_t last)
{
    if (last > text.size())
    {
        last = text.size();
    }
    for (size_t i = first; i < last; i++)
    {
        if (text[i] != '\n')
        {
            text[i] = ' ';
        }
    }
}

// Returns true when the '#' at text[hashPos] is followed (after optional blanks)
// by the word "include".
static bool IsIncludeDirective(const string &text, size_t hashPos)
{
    size_t i = hashPos + 1;
    while (i < text.size() && (text[i] == ' ' || text[i] == '\t'))
    {
        i++;
    }
    return text.compare(i, 7, "include") == 0;
}

// Pre-processes C source text before lexing and writes the result to `path`.
//
// The following spans are replaced by spaces (so every remaining character keeps
// its original offset):
//   * line comments, from "//" up to but not including the line break ("\n" or "\r\n");
//   * block comments, including the delimiters; '\n' inside them is preserved;
//   * "#include <...>" directives, from '#' through the closing '>'.
// Comment markers, '#' and '>' that appear inside character or string literals
// are left alone, and a backslash inside a literal escapes the next character.
//
// str  - the complete source text (taken by value and edited locally)
// path - file that receives the processed text; on failure to open it a message
//        is printed to stderr and nothing is written
void PreTreatment(string str, string path)
{
    enum Mode
    {
        CODE,          // ordinary program text
        SINGLE_QUOTED, // inside '...'
        DOUBLE_QUOTED, // inside "..."
        LINE_COMMENT,  // after "//"
        BLOCK_COMMENT  // after "/*"
    };
    const size_t NONE = string::npos;

    Mode mode = CODE;
    size_t commentStart = 0; // offset of the "//" or "/*" that opened the current comment
    size_t hashPos = NONE;   // offset of the most recent '#' on the current line, if any
    const size_t length = str.size();
    size_t p = 0;

    while (p < length)
    {
        const char c = str[p];
        const char next = (p + 1 < length) ? str[p + 1] : '\0';

        switch (mode)
        {
        case SINGLE_QUOTED:
        case DOUBLE_QUOTED:
        {
            if (c == '\\')
            {
                // Escape sequence: the next character can never close the literal.
                p += 2;
                break;
            }
            const char closing = (mode == SINGLE_QUOTED) ? '\'' : '"';
            if (c == closing)
            {
                mode = CODE;
            }
            p++;
            break;
        }
        case LINE_COMMENT:
        {
            if (c == '\n')
            {
                // Keep the line terminator itself ("\n" or "\r\n").
                const size_t stop = (p > commentStart && str[p - 1] == '\r') ? p - 1 : p;
                BlankRange(str, commentStart, stop);
                mode = CODE;
                hashPos = NONE;
            }
            p++;
            break;
        }
        case BLOCK_COMMENT:
        {
            if (c == '*' && next == '/')
            {
                p += 2;
                BlankRange(str, commentStart, p);
                mode = CODE;
            }
            else
            {
                p++;
            }
            break;
        }
        case CODE:
        {
            if (c == '\'')
            {
                mode = SINGLE_QUOTED;
                p++;
            }
            else if (c == '"')
            {
                mode = DOUBLE_QUOTED;
                p++;
            }
            else if (c == '/' && next == '/')
            {
                mode = LINE_COMMENT;
                commentStart = p;
                p += 2;
            }
            else if (c == '/' && next == '*')
            {
                mode = BLOCK_COMMENT;
                commentStart = p;
                p += 2; // skipping the '*' means "/*/" does not close the comment
            }
            else if (c == '#')
            {
                hashPos = p;
                p++;
            }
            else if (c == '>' && hashPos != NONE)
            {
                // Only "#include <...>" is removed; a '>' on any other directive
                // line (e.g. "#if A > B") is ordinary text.
                if (IsIncludeDirective(str, hashPos))
                {
                    BlankRange(str, hashPos, p + 1);
                }
                // Forget the '#' so later '>' operators cannot blank code again.
                hashPos = NONE;
                p++;
            }
            else
            {
                if (c == '\n')
                {
                    // A directive ends with its line.
                    hashPos = NONE;
                }
                // Everything else, including a stray "*/" outside a comment, is kept.
                p++;
            }
            break;
        }
        }
    }

    // A line comment on the last line may not be followed by a line break.
    if (mode == LINE_COMMENT)
    {
        BlankRange(str, commentStart, length);
    }

    ofstream s(path);
    if (!s)
    {
        cerr << "PreTreatment: cannot open " << path << " for writing" << endl;
        return;
    }
    s << str;
}
int main()
{
string path1 = "C:\\vscodeWork\\test.c";//测试文件
string path2 = "C:\\vscodeWork\\testresult.pretreat";//处理掉注释的文件
string path3= "C:\\vscodeWork\\examples.tokens";//生成目标文件
ifstream fs(path1);
stringstream ss;
ss << fs.rdbuf();
fs.close();
string str = ss.str();
PreTreatment(str, path2);
ifstream fs1(path2);
while (getline(fs1, line))
{
File.push_back(line);
}
fs1.close();
DFA state = INITIAL;
string tokens = "";
string token = "";
char c = NULL;
bool SymbolFlag = false;
bool pre = false;
bool flag = true;
while (flag)
{
if (!pre)
{
c = GetNextChar();
}
pre = false;
switch (state)
{
case INITIAL:
{
token = "";
//如果读入的第一个字符为字母或是下划线
if (c == '_' || isalpha(c))
{
//其中如果是u或者l,可能为字符
if (c == 'u' || c == 'U' || c == 'l' || c == 'L')
{
state = MIDCHAR;
}
//否则则为标识符
else
{
state = IDENTIFIER;
}
token = token + c;
}
//数字
else if (isdigit(c))
{
state = INTERGER;
token = token + c;
}
//单引号转字符态
else if (c == '\'')
{
state = CHAR;
token = token + c;
}
//双引号转字符串
else if (c == '"')
{
state = STRING;
token = token + c;
}
else if (c == ' '||c=='\n'||c=='\r'||c=='\t')
{
}
//终止
else if (c == NULL)
{
flag = false;
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length() + 1) + ":" + to_string(charIndex) + "='" + "<'EOF'>" + "',<" + "EOF" + ">," + to_string(lineIndex) + ":" + to_string(charIndex - token.length() + 1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
}
//其余情况为运算符
else
{
token = token + c;
SymbolFlag = isSymbol(token);
if (SymbolFlag)
{
state = SYMBOL;
}
else
{
cout << "error" << endl;
}
}
break;
}
case IDENTIFIER:
{
if (isalpha(c) || isdigit(c) || c == '_')
{
state = IDENTIFIER;
token = token + c;
}
else
{
if (isKeyword(token))
{
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length()-1) + ":" + to_string(charIndex-2) + "='" + token + "',<'" + token + "'>," + to_string(lineIndex) + ":" + to_string(charIndex - token.length() -1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
}
else
{
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length() - 1) + ":" + to_string(charIndex-2) + "='" + token + "',<" + "Identifier" + ">," + to_string(lineIndex) + ":" + to_string(charIndex - token.length()-1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
}
tokenNum++;
state = INITIAL;
pre = true;
}
break;
}
case MIDCHAR:
{
if (c == '\'')
{
state = CHAR;
token = token + c;
}
else if (c == '"')
{
state = STRING;
token = token + c;
}
else if (c == '8')
{
state = MIDSTRING;
token = token + c;
}
else
{
state = IDENTIFIER;
token = token + c;
}
break;
}
case CHAR:
{
if (c != '\'')
{
state = CHAR;
token = token + c;
}
else
{
token = token + c;
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length() -1) + ":" + to_string(charIndex-2) + "='" + token + "',<" + "CharacterConstant" + ">," + to_string(lineIndex) + ":" + to_string(charIndex - token.length()-1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
tokenNum++;
state = INITIAL;
}
break;
}
case MIDSTRING:
if (c == '"')
{
state = STRING;
token = token + c;
}
else
{
state = IDENTIFIER;
token = token + c;
}
break;
case STRING:
{
if (c != '"')
{
state = STRING;
token = token + c;
}
else
{
token = token + c;
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length() -1) + ":" + to_string(charIndex-2) + "='" + token + "',<" + "StringLiteral" + ">," + to_string(lineIndex) + ":" + to_string(charIndex - token.length() - 1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
tokenNum++;
state = INITIAL;
}
break;
}
case INTERGER:
{
if (isdigit(c) || c == 'x' || c == 'X' || c == 'A' || c == 'B' || c == 'C' || c == 'D' || c == 'E' || c == 'F' || c == 'a' || c == 'b' || c == 'c' || c == 'd' || c == 'e' || c == 'f' || c == 'L' || c == 'l' || c == 'U' || c == 'u')
{
state = INTERGER;
token = token + c;
}
else if (c == '.')
{
state = FLOAT;
token = token + c;
}
else
{
pre = true;
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length() - 1) + ":" + to_string(charIndex-2) + "='" + token + "',<" + "IntegerConstant" + ">," + to_string(lineIndex) + ":" + to_string(charIndex - token.length() - 1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
tokenNum++;
state = INITIAL;
}
break;
}
case FLOAT:
{
if (isdigit(c) || c == 'e' || c == 'E' || c == 'f' || c == 'F' || c == 'L' || c == 'l' || c == 'p' || c == 'P' || c == '+' || c == '-' || c == 'A' || c == 'B' || c == 'C' || c == 'D' || c == 'a' || c == 'b' || c == 'c' || c == 'd')
{
state = FLOAT;
token = token + c;
}
else
{
pre = true;
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length() - 1) + ":" + to_string(charIndex-2) + "='" + token + "',<" + "FloatingConstant" + ">," + to_string(lineIndex) + ":" + to_string(charIndex - token.length() - 1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
tokenNum++;
state = INITIAL;
}
break;
}
case SYMBOL:
{
string temp = token + c;
if (!isSymbol(temp))
{
pre = true;
string tokenstream = "";
tokenstream = "[@" + to_string(tokenNum) + "," + to_string(charIndex - token.length() - 1) + ":" + to_string(charIndex-2) + "='" + token + "',<'" + token + "'>," + to_string(lineIndex) + ":" + to_string(charIndex - token.length() - 1) + "]\n";
tokens = tokens + tokenstream;
cout << tokenstream;
tokenNum++;
state = INITIAL;
}
else
{
state = SYMBOL;
token = temp;
}
break;
}
default:
break;
}
}
ofstream s(path3);
s << tokens;
}