简单的C语言编译器--词法分析器

最新推荐文章于 2021-05-18 05:26:52 发布

weixin_34082177

最新推荐文章于 2021-05-18 05:26:52 发布

阅读量689

点赞数

文章标签： c/c++

原文链接：http://www.cnblogs.com/vachester/p/6884345.html

版权

1. 定义词法单元Tag

首先要将可能出现的词进行分类，可以有不同的分类方式。如多符一类：将所有逗号、分号、括号等都归为一类；或者一符一类：将每个符号各自归为一类。我这里采用的是一符一类的方式。C++代码如下：

    #ifndef TAG_H
    #define TAG_H
    
    // Integer tags identifying each kind of token the lexer can produce.
    // Every symbol gets its own tag ("one symbol, one class").
    namespace Tag {
        // Reserved words
        constexpr int INT = 1;
        constexpr int BOOL = 2;
        constexpr int MAIN = 3;
        constexpr int IF = 4;
        constexpr int ELSE = 5;
        constexpr int FOR = 6;
        constexpr int WHILE = 7;
        constexpr int FALSE = 8;
        constexpr int BREAK = 9;
        constexpr int RETURN = 10;
        constexpr int TRUE = 11;

        // Operators
        constexpr int NOT = 20;        // !
        constexpr int NE = 21;         // !=
        constexpr int AUTOMINUS = 22;  // --
        constexpr int MINUS = 23;      // -
        constexpr int AUTOADD = 24;    // ++
        constexpr int ADD = 25;        // +
        constexpr int OR = 26;         // ||
        constexpr int AND = 27;        // &&
        constexpr int MUTIPLY = 28;    // *
        constexpr int DIVIDE = 29;     // /
        constexpr int MOD = 30;        // %
        constexpr int EQ = 31;         // ==
        constexpr int ASSIN = 32;      // =
        constexpr int GE = 33;         // >=
        constexpr int GT = 34;         // >
        constexpr int LE = 35;         // <=
        constexpr int LS = 36;         // <

        // Delimiters
        constexpr int COMMA = 40;      // ,
        constexpr int SEMICOLON = 41;  // ;
        constexpr int LLBRACKET = 42;  // (
        constexpr int RLBRACKET = 43;  // )
        constexpr int LMBRACKET = 44;  // [
        constexpr int RMBRACKET = 45;  // ]
        constexpr int LGBRACKET = 46;  // {
        constexpr int RGBRACKET = 47;  // }

        // Integer constant
        constexpr int NUM = 50;

        // Identifier
        constexpr int ID = 60;

        // Lexical error
        constexpr int ERROR = 404;

        // No token
        constexpr int EMPTY = 70;
    }

#endif

2. 具体步骤

一个一个字符地扫描测试代码，忽略空白字符，遇到回车时，记录行数加1
要进行区分标识符(即普通变量名字)和保留字
因为将标识符和常数各自归为一类，所以要有算法能够识别出一个完整的常数和一个完整的标识符
加入适当的非法词检测

3. 设计词法分析类

设计一个词法分析器，当然要包括如何存储一个词法单元，如何扫描(scan)测试代码等，直接上代码：

myLexer.h

    #ifndef MYLEXER_H
    #define MYLEXER_H
    
    #include <fstream>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include "tag.h"
    
    
    /*
     * 主要是定义基本的词法单元类，
     * 声明了词法分析类
     */
    
    // A single lexical unit (token): the text that was matched plus the
    // integer tag that classifies it.
    class Word {
        public:
            // Creates an empty token: empty lexeme, tag 0.
            Word() = default;

            // Creates a token with lexeme `s` and tag `t`.
            Word(std::string s, int t) : lexeme(std::move(s)), tag(t) {}

            // Returns a copy of the matched text. Returned by value so the
            // result stays valid even when called on a temporary Word.
            std::string getLexeme() const { return lexeme; }

            // Returns the tag classifying this token.
            int getTag() const { return tag; }

            void setTag(int t) { tag = t; }
            void setLexeme(std::string s) { lexeme = std::move(s); }
        private:
            std::string lexeme;
            // Initialized so a default-constructed Word never exposes an
            // indeterminate value.
            int tag = 0;
    };
    
    // Lexical analyzer: turns a character stream into a sequence of Word
    // tokens, one per call to scan().
    class Lexer {
        public:
            // Registers the reserved words and resets the scanner state.
            Lexer();

            // Adds `w` to the reserved-word table, keyed by its lexeme.
            void reserve(Word w);

            // Reads one character from `in` into the lookahead and reports
            // whether it equals `c`; used for two-character operators.
            bool readnext(char c, std::ifstream &in);

            // Reads and returns the next token from `in`.
            Word scan(std::ifstream &in);

            // Returns the current line counter (starts at 1).
            int getLine() const { return line; }
        private:
            // One-character lookahead; a blank means "nothing buffered".
            char peek = ' ';
            // Reserved words, looked up by lexeme to tell them apart from
            // ordinary identifiers.
            std::unordered_map<std::string, Word> words;
            // Line counter, incremented by scan() for each '\n' it skips.
            int line = 1;
    };
    
    
    #endif

myLexer.cpp

    #include <iostream>
    #include <cctype>
    #include <sstream>
    #include "myLexer.h"
    
    // Stores `w` in the reserved-word table under its own lexeme. An entry
    // that already exists for that lexeme is left untouched.
    void Lexer::reserve(Word w) {
        words.emplace(w.getLexeme(), w);
    }
    
    // Fills the reserved-word table (so keywords can be told apart from
    // identifiers) and resets the lookahead character and line counter.
    Lexer::Lexer() {
        struct Keyword {
            const char *text;
            int tag;
        };
        const Keyword keywords[] = {
            {"int", Tag::INT},
            {"bool", Tag::BOOL},
            {"main", Tag::MAIN},
            {"if", Tag::IF},
            {"else", Tag::ELSE},
            {"for", Tag::FOR},
            {"while", Tag::WHILE},
            {"break", Tag::BREAK},
            {"return", Tag::RETURN},
            {"true", Tag::TRUE},
            {"false", Tag::FALSE},
        };
        for (const Keyword &entry : keywords)
            reserve(Word(entry.text, entry.tag));

        // A blank lookahead makes the first scan() start by reading input.
        peek = ' ';
        line = 1;
    }
    
    // Helper for two-character operators such as ">=" or "++".
    //
    // Reads the next raw character from `in` into the lookahead `peek`.
    // Returns true, and resets `peek` to a blank, when that character equals
    // `c`. Otherwise returns false and leaves the character just read in
    // `peek` for the next scan. At end of input (or on a read error) returns
    // false with `peek` reset to a blank, so a stale lookahead is never
    // mistaken for a match.
    bool Lexer::readnext(char c, std::ifstream &in) {
        // get() instead of operator>>: formatted extraction may skip
        // whitespace, which would let "= =" be read as "==".
        if (!in.get(peek)) {
            peek = ' ';
            return false;
        }
        if (peek != c)
            return false;
        peek = ' ';
        return true;
    }
    
    
    // Reads and returns the next token from `in`.
    //
    // Blanks, tabs, carriage returns and newlines are skipped; every newline
    // skipped increments the line counter. The result is one of:
    //   - an operator or delimiter token,
    //   - Tag::NUM for a run of decimal digits (lexeme has leading zeros
    //     removed, e.g. "007" -> "7"),
    //   - a reserved word, or Tag::ID for any other identifier,
    //   - Word("error", Tag::ERROR) for a lone '|' or '&', a number followed
    //     by '.', or any unrecognized character,
    //   - Word("empty", Tag::EMPTY) once the input is exhausted or unreadable.
    //
    // The offending input is always consumed before an error is returned, and
    // the lookahead is reset at end of input, so repeated calls always make
    // progress and never return the same token twice.
    Word Lexer::scan(std::ifstream &in) {
        // Moves the lookahead to the next raw character. get() is used so
        // whitespace is seen regardless of the stream's skipws flag. When no
        // character is available the lookahead becomes a blank instead of
        // keeping its stale value.
        const auto advance = [&]() {
            if (!in.get(peek))
                peek = ' ';
        };

        // readnext() with an end-of-input guard: a failed read never counts
        // as a match, whatever readnext() left in the lookahead.
        const auto followedBy = [&](char expected) -> bool {
            const bool matched = readnext(expected, in);
            if (!in) {
                peek = ' ';
                return false;
            }
            return matched;
        };

        // <cctype> classifiers require a value representable as unsigned
        // char; a plain char may be negative.
        const auto isDigit = [](char ch) {
            return std::isdigit(static_cast<unsigned char>(ch)) != 0;
        };
        const auto isAlpha = [](char ch) {
            return std::isalpha(static_cast<unsigned char>(ch)) != 0;
        };
        const auto isAlnum = [](char ch) {
            return std::isalnum(static_cast<unsigned char>(ch)) != 0;
        };

        // Skip whitespace, counting lines. Stops on any failed read (not only
        // on EOF), so a stream that could not be opened cannot spin forever.
        for (;;) {
            if (peek == '\n')
                ++line;
            else if (peek != ' ' && peek != '\t' && peek != '\r')
                break;
            if (!in.get(peek)) {
                peek = ' ';
                return Word("empty", Tag::EMPTY);
            }
        }

        // Operators and delimiters.
        switch (peek) {
            case '!':
                return followedBy('=') ? Word("!=", Tag::NE)
                                       : Word("!", Tag::NOT);
            case '-':
                return followedBy('-') ? Word("--", Tag::AUTOMINUS)
                                       : Word("-", Tag::MINUS);
            case '+':
                return followedBy('+') ? Word("++", Tag::AUTOADD)
                                       : Word("+", Tag::ADD);
            case '|':
                // A single '|' is not an operator of this language.
                return followedBy('|') ? Word("||", Tag::OR)
                                       : Word("error", Tag::ERROR);
            case '&':
                // A single '&' is not an operator of this language.
                return followedBy('&') ? Word("&&", Tag::AND)
                                       : Word("error", Tag::ERROR);
            case '*':
                advance();
                return Word("*", Tag::MUTIPLY);
            case '/':
                advance();
                return Word("/", Tag::DIVIDE);
            case '%':
                advance();
                return Word("%", Tag::MOD);
            case '=':
                return followedBy('=') ? Word("==", Tag::EQ)
                                       : Word("=", Tag::ASSIN);
            case '>':
                return followedBy('=') ? Word(">=", Tag::GE)
                                       : Word(">", Tag::GT);
            case '<':
                return followedBy('=') ? Word("<=", Tag::LE)
                                       : Word("<", Tag::LS);
            case ',':
                advance();
                return Word(",", Tag::COMMA);
            case ';':
                advance();
                return Word(";", Tag::SEMICOLON);
            case '(':
                advance();
                return Word("(", Tag::LLBRACKET);
            case ')':
                advance();
                return Word(")", Tag::RLBRACKET);
            case '[':
                advance();
                return Word("[", Tag::LMBRACKET);
            case ']':
                advance();
                return Word("]", Tag::RMBRACKET);
            case '{':
                advance();
                return Word("{", Tag::LGBRACKET);
            case '}':
                advance();
                return Word("}", Tag::RGBRACKET);
        }

        // Integer constants. The digits are collected as text rather than
        // accumulated in an int, so an over-long literal cannot overflow.
        if (isDigit(peek)) {
            std::string digits;
            do {
                // Drop leading zeros, keeping a single "0" for the value zero.
                if (digits == "0")
                    digits.clear();
                digits += peek;
                advance();
            } while (isDigit(peek));

            if (peek != '.')
                return Word(digits, Tag::NUM);

            // Only integers are supported: swallow the '.' and any fraction
            // digits so the malformed literal yields exactly one error.
            do {
                advance();
            } while (isDigit(peek));
            return Word("error", Tag::ERROR);
        }

        // Identifiers and reserved words.
        if (isAlpha(peek)) {
            std::string name;
            do {
                name += peek;
                advance();
            } while (isAlnum(peek) || peek == '_');

            const auto keyword = words.find(name);
            if (keyword != words.end())
                return keyword->second;
            return Word(name, Tag::ID);
        }

        // Unrecognized character: consume it so the next call moves on.
        advance();
        return Word("error", Tag::ERROR);
    }

设计完成后，自己写一个main函数，在while循环中调用scan函数，每次打印出Word的内容，就能够得到词法分析的结果。

转载于:https://www.cnblogs.com/vachester/p/6884345.html