自制编译器：词法单元解析

最新推荐文章于 2024-06-11 17:27:40 发布

yqtaowhu

最新推荐文章于 2024-06-11 17:27:40 发布

阅读量1.8k

点赞数

分类专栏： Alogrithm 文章标签：编译器

本文链接：https://blog.csdn.net/taoyanqi8932/article/details/51778573

版权

Alogrithm 专栏收录该内容

20 篇文章 1 订阅

订阅专栏

最终的表现形式如图
这里写图片描述
词法单元的状态有

// Scanner state: the kind of lexeme that is currently being recognised.
enum State {
    IDENTIFIER_ST = 0,   // reading an identifier (or keyword)
    INTEGER_ST    = 1,   // reading a number
    OPERATION_ST  = 2,   // reading an operator / symbol
    EOF_ST        = 3,   // end of input
    NONE_ST       = 4    // no lexeme in progress
};

在词法解析的编程中，主要的函数为：
// Scans the input until one token has been completed and returns that token.
Token GetNextToken();
它根据当前的状态（state）决定程序接下来调用哪一个识别函数。
1.state=IDENTIFIER_ST，即以字母或下划线开头，是标识符或者关键字，调用下面的函数进行读取其余的字符。

void Scanner::AtIdentifier()  //判断其余字符
{
    while(isalnum(c) || c == '_')   //字母数字或下划线
    {
        AddToBuffer(c);          //加入缓冲区
        ExtractChar();          //读取下一个字符，一直到不满足条件
    }
    IdentifyAndMake();  //即识别是关键字，和打标签
}

2.State=INTEGER_ST
当词法单元以数字开头（如 123）时，在识别到 1 的时候进入 AtInterger() 函数，一直读取到字符不是数字为止，然后直接打上标签。它与标识符不同：关键字保存在保留字的 Map 中，因此读到标识符后要先判断它是不是关键字，再进行打标签；而数字不需要查表，可以直接打标签。

/// Consumes the rest of an integer literal and emits an INT_CONST token.
///
/// Every consecutive decimal digit is appended to the buffers; the first
/// non-digit character stays in `c` for the caller to examine.
void Scanner::AtInterger()
{
    // isdigit() is undefined for negative arguments other than EOF, so the plain
    // `char` is converted to unsigned char before the call.
    while (isdigit(static_cast<unsigned char>(c)))
    {
        AddToBuffer(c);
        ExtractChar();
    }
    // Only integers are recognised here: a fractional part or exponent
    // ('.' / 'e') is not consumed by this function.
    MakeToken(INT_CONST);
}

3.State=OPERATION_ST
关系操作符如<=要进行下一位的检测。

/// Recognises an operator token, preferring a two-character operator such as "<=".
///
/// The current character `c` is tentatively appended to the buffers when it
/// could continue an operator (not end of input, not alphanumeric, not '_',
/// not whitespace). If the extended text is a reserved word, that token is
/// emitted and the next character is extracted. Otherwise the extra character
/// is removed again and the shorter buffered text is tried on its own.
///
/// @throws const char* ("illegal expression") when neither form is a reserved word.
void Scanner::AtOperation()
{
    // <cctype> predicates are undefined for negative arguments other than EOF,
    // so the plain `char` is converted to unsigned char before classification.
    const unsigned char next = static_cast<unsigned char>(c);
    if (!in.eof() && !isalnum(next) && c != '_' && !isspace(next))
    {
        AddToBuffer(c);
        if (TryToIdentify())   // e.g. "<=" matched as a whole
        {
            ExtractChar();     // `c` was consumed by the token; fetch the look-ahead
            return;
        }
        ReduceBuffer();        // no such operator: drop the tentative character
    }
    if (!TryToIdentify())
        throw "illegal expression";
}

下面贴出一些代码供参考
lexer.h

/// Table of reserved lexemes, keyed by their spelling.
class ReservedWords
{
    // spelling -> (token type, token value)
    map<string, pair<TokenType, TokenValue>> words;

public:
    /// Sets up the table of reserved words (the definition is not part of this listing).
    ReservedWords();

    /// Looks `str` up in the table.
    /// @param str             spelling to search for
    /// @param returned_type   receives the token type when `str` is found
    /// @param returned_value  receives the token value when `str` is found
    /// @return true when `str` is in the table, false otherwise
    bool Identify(string& str, TokenType& returned_type, TokenValue& returned_value);

    /// Registers `name` in the table with the given type and value.
    void Add(string name, TokenType _type, TokenValue _value);
};
class Token {
protected:
    TokenType type;
    TokenValue value;
    int row;
    int column;
    int c;
    char* name;
public:
    bool IsRelationalOp() const;   //判断是否是关系操作符
    bool IsAddingOp() const;       //是否是加减或..
    bool IsMultOp() const;         //是否是乘除..
    bool IsVar() const;
    bool IsConstVar() const;
    Token();                       //默认构造函数
    Token(const char* name_, TokenType type_, TokenValue value_, int line_ = -1, int pos_ = -1);
    //Token(const Token& token);     //复制构造函授
    //Token(TokenValue val);
    //Token(int value_);
    //Token(float value_);
    Token& operator=(const Token& token);      //复制操作符
    //~Token();
    TokenType GetType() const;        
    TokenValue GetValue() const;
    int GetColumn() const;
    int GetRow() const;
    virtual const char* GetName() const;
};

/// Character-by-character lexer that reads from an input stream.
class Scanner
{
    // Data members. Their order is significant: it is the order in which they
    // are initialised.
    string buffer;                  // spelling of the lexeme collected so far
    string buffer_low;              // lower-cased copy of `buffer`, used for reserved-word lookup
    ReservedWords reserved_words;   // default-constructed reserved-word table
    int row;
    int column;
    char c;                         // most recently extracted character
    int first_pos;                  // handed to the Token constructor by MakeToken
    int first_line;                 // handed to the Token constructor by MakeToken
    State state;                    // current scanner state
    istream& in;                    // input being scanned
    Token token;                    // most recently built token (default-constructed at first)

public:
    /// Creates a scanner that reads from the stream `input`.
    Scanner(istream& input);

    /// Returns the most recently built token.
    Token GetToken();
    /// Scans forward until a token is complete and returns it.
    Token GetNextToken();

    // Character and buffer handling.
    void ExtractChar();             // fetch the next character into `c`
    void AddToBuffer(char c);       // append a character to both buffers
    void ReduceBuffer();            // drop the last buffered character

    // Recognisers, one per scanner state, plus comment skipping.
    void AtLineComment();           // skip a "//" comment
    void AtIdentifier();            // identifier or keyword
    void AtInterger();              // integer literal
    void AtOperation();             // operator / symbol

    // Token construction.
    bool TryToIdentify();           // emit a token if the buffer is a reserved word
    void IdentifyAndMake();         // reserved word or plain identifier
    void MakeToken(TokenType type, TokenValue value= TOK_UNRESERVED);   // `value` is optional
};

lexer.cpp

void ReservedWords::Add(string name, TokenType _type, TokenValue _value)
{
    //这里要注意的是怎样进行添加
    words.insert(pair<string, pair<TokenType, TokenValue>>(name, pair<TokenType, TokenValue>(_type, _value)));
}
//
/// Looks `str` up among the reserved words.
/// On success the entry's type and value are written to the out-parameters
/// and true is returned. Otherwise the out-parameters are left untouched and
/// false is returned.
bool ReservedWords::Identify(string& str, TokenType& returned_type, TokenValue& returned_value)
{
    const auto entry = words.find(str);
    const bool is_reserved = (entry != words.end());
    if (is_reserved)
    {
        returned_type = entry->second.first;
        returned_value = entry->second.second;
    }
    return is_reserved;
}
//Token
/// Default token: UNDEFINED type, TOK_UNRESERVED value, position (0, 0) and an
/// empty name held in its own one-character array.
Token::Token() :
    type(UNDEFINED),
    value(TOK_UNRESERVED),
    row(0),
    column(0),
    name(new char[1]())   // value-initialised: contains just the terminating '\0'
{
}
//
/// Builds a token with its own copy of the spelling.
/// @param name_   NUL-terminated spelling; a null pointer is treated as ""
/// @param type_   token type
/// @param value_  token value
/// @param line_   stored as the token's row
/// @param pos_    stored as the token's column
Token::Token(const char* name_, TokenType type_, TokenValue value_, int line_, int pos_) :
    type(type_),
    value(value_),
    row(line_),
    column(pos_),
    name(nullptr)
{
    // strlen/strcpy are undefined for a null pointer, so fall back to "".
    const char* const text = (name_ != nullptr) ? name_ : "";
    name = strcpy(new char[strlen(text) + 1], text);
}
// 赋值操作符
/// Copy assignment: replaces this token's spelling, type, value and position
/// with copies of `token`'s.
/// @return *this
Token& Token::operator=(const Token& token)
{
    // Self-assignment must not free the array that is about to be read.
    if (this == &token)
        return *this;

    // Allocate the copy first, so a failed allocation leaves *this unchanged.
    const char* const source = (token.name != nullptr) ? token.name : "";
    char* const copy = strcpy(new char[strlen(source) + 1], source);

    delete[] name;   // `name` comes from new[], so it needs the array form
    name = copy;
    type = token.type;
    value = token.value;
    row = token.row;
    column = token.column;
    return *this;
}
int Token::GetRow() const
{
    return row;
}
//
int Token::GetColumn() const
{
    return column;
}
//
/// Returns the token's type.
TokenType Token::GetType() const
{
    return this->type;
}

/// Returns the token's value.
TokenValue Token::GetValue() const
{
    return this->value;
}
const char* Token::GetName() const
{
    return name;
}
//Scanner
/// Creates a scanner that reads from `input`.
/// Scanning starts at row 1, column 0, in the NONE_ST state.
Scanner::Scanner(istream& input) :
    // Listed in declaration order, which is the order members are actually
    // initialised in.
    row(1),
    column(0),
    c(0),
    first_pos(0),     // previously left indeterminate
    first_line(1),    // previously left indeterminate
    state(NONE_ST),
    in(input)
{}
//
bool Scanner::TryToIdentify()
{
    TokenType t;
    TokenValue v;
    if (reserved_words.Identify(buffer_low, t, v))
    {
        MakeToken(t, v);
        return true;
    }
    return false;
}
/// Reads one character from the input into `c` and advances the column counter.
void Scanner::ExtractChar()
{
    const int fetched = in.get();
    c = static_cast<char>(fetched);
    column += 1;
}
//
/// Removes the last character from both buffers.
/// Does nothing to a buffer that is already empty: `size() - 1` would wrap
/// around to a huge value there and make resize() fail.
void Scanner::ReduceBuffer()
{
    if (!buffer.empty())
        buffer.resize(buffer.size() - 1);
    if (!buffer_low.empty())
        buffer_low.resize(buffer_low.size() - 1);
}
//注释
/// Skips a "//" line comment if one starts at the current character.
/// Characters are extracted until a newline has been read or the input is
/// exhausted; when the loop stops on a newline, that newline is left in `c`.
void Scanner::AtLineComment()
{
    // peek() is only consulted when the current character is already '/'.
    const bool starts_comment = (c == '/') && (in.peek() == '/');
    if (!starts_comment)
        return;

    for (;;)
    {
        ExtractChar();
        if (c == '\n' || in.eof())
            break;
    }
}
//
/// Appends `c` to `buffer` and its lower-cased form to `buffer_low`.
void Scanner::AddToBuffer(char c)
{
    buffer.push_back(c);
    // tolower() is undefined for negative arguments other than EOF, so the
    // plain `char` goes through unsigned char before the call.
    buffer_low.push_back(static_cast<char>(tolower(static_cast<unsigned char>(c))));
}
//
void Scanner::AtIdentifier()  //判断其余字符
{
    while(isalnum(c) || c == '_')   //字母数字或下划线
    {
        AddToBuffer(c);          //加入
        ExtractChar();          //读取下一个字符，一直到不满足条件
    }
    IdentifyAndMake();
}
//
/// Builds a token from the buffered spelling and the recorded start position,
/// stores it in `token`, then empties both buffers and returns the scanner to
/// the NONE_ST state.
void Scanner::MakeToken(TokenType type, TokenValue value)
{
    const Token built(buffer.c_str(), type, value, first_line, first_pos);
    token = built;

    buffer_low.clear();
    buffer.clear();
    state = NONE_ST;
}
//
/// Consumes the rest of an integer literal and emits an INT_CONST token.
///
/// Every consecutive decimal digit is appended to the buffers; the first
/// non-digit character stays in `c` for the caller to examine.
void Scanner::AtInterger()
{
    // isdigit() is undefined for negative arguments other than EOF, so the plain
    // `char` is converted to unsigned char before the call.
    while (isdigit(static_cast<unsigned char>(c)))
    {
        AddToBuffer(c);
        ExtractChar();
    }
    // Only integers are recognised here: a fractional part or exponent
    // ('.' / 'e') is not consumed by this function.
    MakeToken(INT_CONST);
}
//
/// Recognises an operator token, preferring a two-character operator such as "<=".
///
/// The current character `c` is tentatively appended to the buffers when it
/// could continue an operator (not end of input, not alphanumeric, not '_',
/// not whitespace). If the extended text is a reserved word, that token is
/// emitted and the next character is extracted. Otherwise the extra character
/// is removed again and the shorter buffered text is tried on its own.
///
/// @throws const char* ("illegal expression") when neither form is a reserved word.
void Scanner::AtOperation()
{
    // <cctype> predicates are undefined for negative arguments other than EOF,
    // so the plain `char` is converted to unsigned char before classification.
    const unsigned char next = static_cast<unsigned char>(c);
    if (!in.eof() && !isalnum(next) && c != '_' && !isspace(next))
    {
        AddToBuffer(c);
        if (TryToIdentify())   // e.g. "<=" matched as a whole
        {
            ExtractChar();     // `c` was consumed by the token; fetch the look-ahead
            return;
        }
        ReduceBuffer();        // no such operator: drop the tentative character
    }
    if (!TryToIdentify())
        throw "illegal expression";
}
void Scanner::IdentifyAndMake()
{
    TokenType t;
    TokenValue v;
    if (!reserved_words.Identify(buffer_low, t, v)) //如果不是保留字,注意在Identify函数中
    {                                               //用的是引用，所以会改变t,v的值
        t = IDENTIFIER;           //说明其实普通标识符
        v = TOK_UNRESERVED;       //值为不是保留字
    }
    MakeToken(t, v);
}
/// Returns a copy of the most recently built token without scanning further.
Token Scanner::GetToken()
{
    return this->token;
}
/// Scans forward until one token has been completed and returns it.
///
/// Each pass extracts one character and, if a lexeme is in progress, lets the
/// recogniser for the current state finish it. Once the scanner is back in
/// NONE_ST, the character left in `c` is examined: a line comment is skipped,
/// a newline advances the row counter, and any other non-space character
/// selects the state for the next lexeme and becomes its first buffered
/// character.
///
/// Example for the input "begin": the first pass sees 'b', switches to
/// IDENTIFIER_ST and buffers it; the second pass extracts 'e' and
/// AtIdentifier() consumes the rest of the word.
Token Scanner::GetNextToken()
{
    bool match = false;
    do
    {
        ExtractChar();
        if (state != NONE_ST) match = true;   // a recogniser runs in this pass
        switch (state)
        {
        case IDENTIFIER_ST:
            AtIdentifier();
            break;
        case INTEGER_ST:
            AtInterger();
            break;
        case OPERATION_ST:
            AtOperation();
            break;
        case EOF_ST:
            MakeToken(END_OF_FILE);
            break;
        case NONE_ST:   // nothing in progress
        default:
            break;
        }
        if (state == NONE_ST)
        {
            AtLineComment();
            if (c == '\n')
            {
                ++row;
                column = 0;
            }
            else
            {
                // Remember where the next lexeme starts. MakeToken passes
                // (first_line, first_pos) to Token as (row, column), so the
                // line must come from `row` and the position from `column`.
                first_line = row;
                first_pos = column;
                if (in.eof())
                {
                    state = EOF_ST;
                }
                else
                {
                    // <cctype> predicates are undefined for negative arguments
                    // other than EOF; classify through unsigned char.
                    const unsigned char current = static_cast<unsigned char>(c);
                    if (!isspace(current))
                    {
                        if (isalpha(current) || c == '_')
                            state = IDENTIFIER_ST;   // identifier or keyword
                        else if (isdigit(current))
                            state = INTEGER_ST;
                        else
                            state = OPERATION_ST;
                        AddToBuffer(c);
                    }
                }
            }
        }
    } while (!match);
    return token;
}