编译原理词法分析器_词法分析器csdn-CSDN博客

本文链接：https://blog.csdn.net/galaxy_yr/article/details/134631244

算法描述

对于给出的源代码，我们按行将其读入，对于每一行单独进行词法分析。

过滤行前后空格
对字符串进行词语的分割
- 有空格则把空格前的字符归为一个词
- 比较上一个字符和当前字符是否需要进行分割
检查词语是否合法
词语合法则按 [待测代码中的单词符号] [TAB] <[单词符号种别],[单词符号内容]> 的格式进行输出。其中，单词符号种别为 KW（关键字）、OP（运算符）、SE（界符）、IDN（标识符）、INT（整型数）；对于 KW、OP、SE，单词符号内容为其编号（见单词表），其余种别的单词符号内容为其值。

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace std;

// Table of reserved words and symbols. The numeric code written for a
// keyword, operator or separator token is its index in this table plus one.
// NOTE(review): ">" precedes "<" here, while the symbol table printed after
// the listing gives "<" the smaller code -- confirm which order is intended.
const int WORD_NUM = 26;
const string WORD[WORD_NUM] = {
    "int", "void", "return", "const", "main", "struct", "+",  "-",  "*",  "/",
    "%",   "=",    ">",      "<",     "==",   "<=", ">=", "!=", "&&",
    "||",  "(",    ")",      "{",     "}",    ";",  ",",
};

// Characters that may appear inside an operator. '!' is required so that the
// "!=" entry of WORD can actually be assembled from the input.
const string OPERATOR = "+-*/%><=&|!";

// Characters that always form a one-character word of their own. '[' and ']'
// have no entry in WORD; listing them here only isolates them from their
// neighbours.
const string SEPARATER = "(){};,[]";

// Half-open index ranges [start, end) into WORD for keywords (kw*),
// operators (op*) and separators (se*).
const int kws = 0, kwe = 6, ops = 6, ope = 20, ses = 20, see = 26;

class Analyzer {
  private:
    vector<string> lines;
    vector<string> token;
    string fileName;
    ofstream fout;

    int isWord(string word) {
        for(int i = 0; i < WORD_NUM; i++) {
            if(word == WORD[i])
                return i;
        }
        return -1;
    }
    bool isKeyWord(int idx) {
        return kws <= idx && idx < kwe;
    }
    bool isOperator(int idx) {
        return ops <= idx && idx < ope;
    }
    bool isOperator(char ch) {
        return OPERATOR.find(ch) != OPERATOR.npos;
    }
    bool isSeparater(int idx) {
        return ses <= idx && idx < see;
    }
    bool isSeparater(char ch) {
        return SEPARATER.find(ch) != SEPARATER.npos;
    }
    inline bool isNumber(char ch) {
        return ch >= '0' && ch <='9';
    }
    bool isInt(string word) {
        for(int i = 0; i < word.size(); i++) {
            if(!isNumber(word[i]))
                return false;
        }
        return true;
    }
    inline bool isCharacter(char ch) {
        return ch >= 'a' && ch <= 'z' || ch >='A' && ch <= 'Z';
    }
    bool isPartOfIdentifier(char c) {
        return isCharacter(c) || isNumber(c) || c == '_';
    }
    bool isIdentifier(string word) {
        if(isNumber(word[0])) {
            return false;
        }
        for(int i = 1; i < word.size(); i++) {
            if(!isPartOfIdentifier(word[i]))
                return false;
        }
        return true;
    }
    //输出
    inline void record(string word, string type, string content) {
        char TAB = '\t';
        string msg = word + TAB + "<" + type + "," + content + ">";
        fout << msg << endl;
        token.push_back(msg);
    }
    //int 转 string
    string to_string(int val) {
        stringstream ss;
        ss << val;
        string result;
        ss >> result;
        return result;
    }
    //分析一个单词
    bool anaylyseWord(string word) {
        if(word.empty()) {
            return true;
        }
        int idx = isWord(word);
        if(idx > -1) {
            string type;
            if(isKeyWord(idx)) type = "KW";
            if(isOperator(idx)) type = "OP";
            if (isSeparater(idx)) type = "SE";
            record(word, type, to_string(idx + 1));
            return true;
        } else {
            if(isIdentifier(word)) {
                record(word,"IND", word);
                return true;
            }
            if(isInt(word)) {
                record(word,"INT", word);
                return true;
            }
        }
        fout << "ERROR detected!" << endl;
        cout << "ERROR detected!" << endl;
        return false;
    }
    //去除字符串前后空格
    string trim(string s) { 
        if(s == "") {
            return "";
        }
        int l = 0, r = s.size() - 1;
        while(s[l] == ' ' && l < s.size()) l++;
        while(s[r] == ' ' && r > l) r--;
        return s.substr(l,r + 1);
    }

    //判断两个相邻字符是否需要分割
    bool check(char a, char b) {
        if ((isOperator(a) && !isOperator(b)) ||
            (!isOperator(a) && isOperator(b)) || isSeparater(a) ||
            (!isSeparater(a) && isSeparater(b)))
            return false;
        return true;

    }

  public:
    Analyzer(string fileName) {
        readFile(fileName);
    }
    ~Analyzer() {
        fout.close();
    }
    vector<string> getToken() {
        return token;
    }

    void readFile(string fileName) {
        this->fileName = fileName;
        fstream fin(fileName.c_str());
        if (!fin.is_open()) {
            throw "无法打开文件";
        }
        string line;
        while (getline(fin, line)) {
            line = trim(line);
            if(!line.empty())
                lines.push_back(line);
        }
        fin.close();
        // fout.open("token.txt");
        fout.open(fileName.substr(0,fileName.find_last_of(".")) + ".out");
    }

    void analyse() {
        int l = 0;
        string word = "";
        while(l < lines.size()) {
            string line = lines[l++]; //读入一行
            word.clear();
            for(int i = 0; i < line.size(); i++) {
                if(line[i] == ' ' || line[i] == '\t') { //分割单词
                    if(!anaylyseWord(word)) return; //判断单词是否合法并打印
                    word.clear();
                    continue;
                }
                if(!check(word[word.size() - 1], line[i])) { //分割单词
                    if(!anaylyseWord(word)) return; //判断单词是否合法并打印
                    word.clear();
                }
                word += line[i]; 
            }
            anaylyseWord(word); //到行末结束后，将剩余的拼成一个单词
        }
    }
};

// Entry point: tokenizes "a.sy" with Analyzer and reports a failure through
// the process exit status instead of always returning 0.
int main() {
    int status = EXIT_SUCCESS;
    try {
        Analyzer analyzer("a.sy");
        analyzer.analyse();
        // "pause" is a Windows shell command; it keeps the console open.
        system("pause");
    } catch (const char *msg) {
        // Analyzer throws a const char* message when the input cannot be opened.
        cout << msg << endl;
        status = EXIT_FAILURE;
    }
    return status;
}

算法NFA和DFA及单词表

请添加图片描述

单词符号	种类	种别码
int	关键字	1
void	关键字	2
return	关键字	3
const	关键字	4
main	关键字	5
struct	关键字	6
+	运算符	7
-	运算符	8
*	运算符	9
/	运算符	10
%	运算符	11
=	运算符	12
<	运算符	13
>	运算符	14
==	运算符	15
<=	运算符	16
>=	运算符	17
!=	运算符	18
&&	运算符	19
\|\|	运算符	20
(	界符	21
)	界符	22
{	界符	23
}	界符	24
;	界符	25
,	界符	26