算法描述
对于给出的源代码,我们按行将其读入,对于每一行单独进行词法分析。
- 过滤行前后空格
- 对字符串进行词语的分割
- 有空格则把空格前的字符归为一个词
- 比较上一个字符和当前字符是否需要进行分割
- 检查词语是否合法
- 词语合法则按 [待测代码中的单词符号] [TAB] <[单词符号种别],[单词符号内容]> 进行输出。其中,单词符号种别为 KW(关键字)、OP(运算符)、SE(界符)、IDN(标识符)、INT(整型数);KW、OP、SE 的单词符号内容为其编号(见单词表),其余种别的内容为其值。
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

// Number of entries in the word table below.
const int WORD_NUM = 26;

// Word table: keywords, then operators, then separators.
// The code printed for a table entry is its index + 1.
const string WORD[WORD_NUM] = {
    "int", "void", "return", "const", "main", "struct", "+", "-", "*", "/",
    "%", "=", ">", "<", "==", "<=", ">=", "!=", "&&",
    "||", "(", ")", "{", "}", ";", ",",
};

// Characters that can be part of an operator. '!' is listed so that the
// "!=" entry of WORD can actually be assembled by the splitter.
const string OPERATOR = "+-*/%><=&|!";

// Characters that always form a one-character token of their own.
// '[' and ']' are split off as well although WORD has no entry for them.
const string SEPARATER = "(){};,[]";

// Half-open index ranges [start, end) into WORD for each token class.
const int kws = 0, kwe = 6;    // keywords
const int ops = 6, ope = 20;   // operators
const int ses = 20, see = 26;  // separators
class Analyzer {
private:
vector<string> lines;
vector<string> token;
string fileName;
ofstream fout;
int isWord(string word) {
for(int i = 0; i < WORD_NUM; i++) {
if(word == WORD[i])
return i;
}
return -1;
}
bool isKeyWord(int idx) {
return kws <= idx && idx < kwe;
}
bool isOperator(int idx) {
return ops <= idx && idx < ope;
}
bool isOperator(char ch) {
return OPERATOR.find(ch) != OPERATOR.npos;
}
bool isSeparater(int idx) {
return ses <= idx && idx < see;
}
bool isSeparater(char ch) {
return SEPARATER.find(ch) != SEPARATER.npos;
}
inline bool isNumber(char ch) {
return ch >= '0' && ch <='9';
}
bool isInt(string word) {
for(int i = 0; i < word.size(); i++) {
if(!isNumber(word[i]))
return false;
}
return true;
}
inline bool isCharacter(char ch) {
return ch >= 'a' && ch <= 'z' || ch >='A' && ch <= 'Z';
}
bool isPartOfIdentifier(char c) {
return isCharacter(c) || isNumber(c) || c == '_';
}
bool isIdentifier(string word) {
if(isNumber(word[0])) {
return false;
}
for(int i = 1; i < word.size(); i++) {
if(!isPartOfIdentifier(word[i]))
return false;
}
return true;
}
//输出
inline void record(string word, string type, string content) {
char TAB = '\t';
string msg = word + TAB + "<" + type + "," + content + ">";
fout << msg << endl;
token.push_back(msg);
}
//int 转 string
string to_string(int val) {
stringstream ss;
ss << val;
string result;
ss >> result;
return result;
}
//分析一个单词
bool anaylyseWord(string word) {
if(word.empty()) {
return true;
}
int idx = isWord(word);
if(idx > -1) {
string type;
if(isKeyWord(idx)) type = "KW";
if(isOperator(idx)) type = "OP";
if (isSeparater(idx)) type = "SE";
record(word, type, to_string(idx + 1));
return true;
} else {
if(isIdentifier(word)) {
record(word,"IND", word);
return true;
}
if(isInt(word)) {
record(word,"INT", word);
return true;
}
}
fout << "ERROR detected!" << endl;
cout << "ERROR detected!" << endl;
return false;
}
//去除字符串前后空格
string trim(string s) {
if(s == "") {
return "";
}
int l = 0, r = s.size() - 1;
while(s[l] == ' ' && l < s.size()) l++;
while(s[r] == ' ' && r > l) r--;
return s.substr(l,r + 1);
}
//判断两个相邻字符是否需要分割
bool check(char a, char b) {
if ((isOperator(a) && !isOperator(b)) ||
(!isOperator(a) && isOperator(b)) || isSeparater(a) ||
(!isSeparater(a) && isSeparater(b)))
return false;
return true;
}
public:
Analyzer(string fileName) {
readFile(fileName);
}
~Analyzer() {
fout.close();
}
vector<string> getToken() {
return token;
}
void readFile(string fileName) {
this->fileName = fileName;
fstream fin(fileName.c_str());
if (!fin.is_open()) {
throw "无法打开文件";
}
string line;
while (getline(fin, line)) {
line = trim(line);
if(!line.empty())
lines.push_back(line);
}
fin.close();
// fout.open("token.txt");
fout.open(fileName.substr(0,fileName.find_last_of(".")) + ".out");
}
void analyse() {
int l = 0;
string word = "";
while(l < lines.size()) {
string line = lines[l++]; //读入一行
word.clear();
for(int i = 0; i < line.size(); i++) {
if(line[i] == ' ' || line[i] == '\t') { //分割单词
if(!anaylyseWord(word)) return; //判断单词是否合法并打印
word.clear();
continue;
}
if(!check(word[word.size() - 1], line[i])) { //分割单词
if(!anaylyseWord(word)) return; //判断单词是否合法并打印
word.clear();
}
word += line[i];
}
anaylyseWord(word); //到行末结束后,将剩余的拼成一个单词
}
}
};
/**
 * Entry point: runs the analyzer on "a.sy" in the current directory, then
 * waits via system("pause").
 *
 * @return 0 on success, 1 if the Analyzer reported a failure by throwing a
 *         message (e.g. the input file could not be opened).
 */
int main() {
    try {
        Analyzer analyzer("a.sy");
        analyzer.analyse();
        system("pause");
    } catch (const char *msg) {
        cout << msg << endl;
        return 1;  // report the failure to the caller instead of exiting with 0
    }
    return 0;
}
算法NFA和DFA及单词表
单词符号 | 种类 | 种别码 |
---|---|---|
int | 关键字 | 1 |
void | 关键字 | 2 |
return | 关键字 | 3 |
const | 关键字 | 4 |
main | 关键字 | 5 |
struct | 关键字 | 6 |
+ | 运算符 | 7 |
- | 运算符 | 8 |
* | 运算符 | 9 |
/ | 运算符 | 10 |
% | 运算符 | 11 |
= | 运算符 | 12 |
> | 运算符 | 13 |
< | 运算符 | 14 |
== | 运算符 | 15 |
<= | 运算符 | 16 |
>= | 运算符 | 17 |
!= | 运算符 | 18 |
&& | 运算符 | 19 |
\|\| | 运算符 | 20 |
( | 界符 | 21 |
) | 界符 | 22 |
{ | 界符 | 23 |
} | 界符 | 24 |
; | 界符 | 25 |
, | 界符 | 26 |