词法分析:识别 Token。
依据构造好的有限自动机,在不同的状态中迁移,从而解析出 Token 来。
python
base_type.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from enum import Enum
class TokenType(Enum):
    """Kinds of tokens the lexer can produce."""

    Plus = 0            # +
    Minus = 1           # -
    Star = 2            # *
    Slash = 3           # /
    GE = 4              # >=
    GT = 5              # >
    EQ = 6              # ==
    LE = 7              # <=
    LT = 8              # <
    SemiColon = 9       # ;
    LeftParen = 10      # (
    RightParen = 11     # )
    Assignment = 12     # =
    If = 13
    Else = 14
    Int = 15
    Identifier = 16     # identifier
    IntLiteral = 17     # integer literal
    StringLiteral = 18  # string literal
class Token(object):
    """A minimal token interface: just a type and a text value.

    Concrete lexers return objects implementing these two accessors.
    """

    def get_type(self):
        """Return the token's type (a TokenType)."""
        return None

    def get_text(self):
        """Return the token's text value."""
        return None
class ASTNodeType(Enum):
    """Kinds of AST nodes."""

    Programm = 0        # program entry, the root node
    IntDeclaration = 1  # integer variable declaration
    ExpressionStmt = 2  # expression statement: an expression followed by ';'
    AssignmentStmt = 3  # assignment statement
    Primary = 4         # primary expression
    Multiplicative = 5  # multiplicative expression
    Additive = 6        # additive expression
    Identifier = 7      # identifier
    IntLiteral = 8      # integer literal
class ASTNode(object):
    """An AST node interface: type, text value, children and parent."""

    def get_parent(self):
        """Return the parent node."""
        return None

    def get_children(self):
        """Return the list of child nodes."""
        return None

    def get_type(self):
        """Return the node's ASTNodeType."""
        return None

    def get_text(self):
        """Return the node's text value."""
        return None
class TokenReader(object):
    """A stream of tokens, produced by a lexer and consumed by a parser."""

    def read(self):
        """Return the next token and remove it from the stream; None if empty."""
        return None

    def peek(self):
        """Return the next token without removing it; None if empty."""
        return None

    def unread(self):
        """Step the stream back one token, restoring the previous one."""
        return None

    def get_position(self):
        """Return the current read position in the stream."""
        return None

    def set_position(self, position):
        """Set the current read position in the stream."""
        return None
simple_lexer.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from play_with_compiler.craft.base_type import Token, TokenReader, TokenType
from enum import Enum
class SimpleToken(Token):
    """A plain Token implementation: just a type and a text value."""

    def __init__(self):
        # Both fields are filled in by the lexer while the token is
        # being recognized.
        self.token_type = None
        self.token_text = ''

    def get_type(self):
        """Return this token's TokenType."""
        return self.token_type

    def get_text(self):
        """Return this token's raw text."""
        return self.token_text
class SimpleTokenReader(TokenReader):
    """A simple TokenReader backed by a list of tokens."""

    def __init__(self, tokens):
        self.tokens = tokens  # the full token list
        self.pos = 0          # index of the next token to read

    def read(self):
        """Return the next token and advance, or None at end of stream."""
        if self.pos < len(self.tokens):
            token = self.tokens[self.pos]
            self.pos += 1
            return token
        return None

    def peek(self):
        """Return the next token without advancing, or None at end of stream."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return None

    def unread(self):
        """Step back one token (no-op at the start of the stream)."""
        if self.pos > 0:
            self.pos -= 1

    def get_position(self):
        """Return the current read position."""
        return self.pos

    def set_position(self, position):
        """Restore a previously saved read position.

        Accepts positions in [0, len(tokens)].  len(tokens) is exactly what
        get_position() returns once the stream has been fully consumed; the
        original rejected it (used ``<`` instead of ``<=``), which silently
        broke the save/restore idiom at end of stream.
        """
        if 0 <= position <= len(self.tokens):
            self.pos = position
class DfaState(Enum):
    """States of the lexer's finite state machine."""

    Initial = 0

    # Keyword recognition ('if', 'else', 'int') plus the identifier
    # states reachable from their prefixes.
    If = 1
    Id_if1 = 2
    Id_if2 = 3
    Else = 4
    Id_else1 = 5
    Id_else2 = 6
    Id_else3 = 7
    Id_else4 = 8
    Int = 9
    Id_int1 = 10
    Id_int2 = 11
    Id_int3 = 12
    Id = 13

    # Operators and punctuation.
    GT = 14
    GE = 15
    Assignment = 16
    Plus = 17
    Minus = 18
    Star = 19
    Slash = 20
    SemiColon = 21
    LeftParen = 22
    RightParen = 23

    # Literals.
    IntLiteral = 24
class SimpleLexer(object):
    """A simple hand-written DFA-based lexer.

    Produces tokens for the simple calculator / toy script language used
    later in the course.
    """

    def __init__(self):
        self.token = SimpleToken()  # the token currently being recognized
        self.tokens = []            # tokens recognized so far

    def is_alpha(self, ch):
        """Return True if ch is an ASCII letter."""
        return ('a' <= ch <= 'z') or ('A' <= ch <= 'Z')

    def is_digit(self, ch):
        """Return True if ch is an ASCII digit."""
        return '0' <= ch <= '9'

    def is_blank(self, ch):
        """Return True if ch is a blank character (space, tab or newline)."""
        return ch == ' ' or ch == '\t' or ch == '\n'

    def dump(self, tokenReader):
        """Print every token in the reader, one per line."""
        print('text\t\ttype')
        token = tokenReader.read()
        while token is not None:
            # Use the Token accessors instead of reaching into the
            # implementation's attributes (the original read token_text /
            # token_type directly, tying dump to SimpleToken).
            print('{}\t\t{}'.format(token.get_text(), token.get_type()))
            token = tokenReader.read()

    def init_token(self, ch):
        """Enter the initial state of the finite state machine.

        The machine never rests in this state: the character immediately
        drives it somewhere else.  Called both at the start of lexing and
        whenever a token has just been completed — in the latter case the
        finished token is saved and a fresh one is started.

        Returns the DfaState chosen for ch.
        """
        if len(self.token.token_text) > 0:
            # A non-empty buffer means the previous token just ended.
            self.tokens.append(self.token)
            self.token = SimpleToken()
        new_state = DfaState.Initial
        if self.is_alpha(ch):  # a letter starts an identifier (or a keyword)
            if ch == 'i':
                new_state = DfaState.Id_int1  # might be the 'int' keyword
            else:
                new_state = DfaState.Id
            self.token.token_type = TokenType.Identifier
            self.token.token_text += ch
        elif self.is_digit(ch):  # a digit starts an integer literal
            new_state = DfaState.IntLiteral
            self.token.token_type = TokenType.IntLiteral
            self.token.token_text += ch
        elif ch == '>':  # might become '>=' later
            new_state = DfaState.GT
            self.token.token_type = TokenType.GT
            self.token.token_text += ch
        elif ch == '+':
            new_state = DfaState.Plus
            self.token.token_type = TokenType.Plus
            self.token.token_text += ch
        elif ch == '-':
            new_state = DfaState.Minus
            self.token.token_type = TokenType.Minus
            self.token.token_text += ch
        elif ch == '*':
            new_state = DfaState.Star
            self.token.token_type = TokenType.Star
            self.token.token_text += ch
        elif ch == '/':
            new_state = DfaState.Slash
            self.token.token_type = TokenType.Slash
            self.token.token_text += ch
        elif ch == ';':
            new_state = DfaState.SemiColon
            self.token.token_type = TokenType.SemiColon
            self.token.token_text += ch
        elif ch == '(':
            new_state = DfaState.LeftParen
            self.token.token_type = TokenType.LeftParen
            self.token.token_text += ch
        elif ch == ')':
            new_state = DfaState.RightParen
            self.token.token_type = TokenType.RightParen
            self.token.token_text += ch
        elif ch == '=':
            new_state = DfaState.Assignment
            self.token.token_type = TokenType.Assignment
            self.token.token_text += ch
        else:
            new_state = DfaState.Initial  # skip all unknown patterns
        return new_state

    def tokenize(self, code):
        """Split code into tokens, driving the DFA character by character.

        Returns a SimpleTokenReader over the recognized tokens.
        """
        self.tokens = []
        self.token = SimpleToken()
        ich = 0
        ch = 0
        state = DfaState.Initial
        while ich < len(code):
            ch = code[ich]
            if state == DfaState.Initial:
                state = self.init_token(ch)  # choose the next state
            elif state == DfaState.Id:
                if self.is_alpha(ch) or self.is_digit(ch):
                    self.token.token_text += ch  # stay in the identifier
                else:
                    state = self.init_token(ch)  # identifier done; save it
            elif state == DfaState.GT:
                if ch == '=':
                    self.token.token_type = TokenType.GE  # upgrade to GE
                    state = DfaState.GE
                    self.token.token_text += ch
                else:
                    state = self.init_token(ch)  # plain GT; save it
            elif state in (DfaState.GE, DfaState.Assignment, DfaState.Plus,
                           DfaState.Minus, DfaState.Star, DfaState.Slash,
                           DfaState.SemiColon, DfaState.LeftParen,
                           DfaState.RightParen):
                # Single-token states: any character ends the token.
                state = self.init_token(ch)
            elif state == DfaState.IntLiteral:
                if self.is_digit(ch):
                    self.token.token_text += ch  # stay in the literal
                else:
                    state = self.init_token(ch)  # literal done; save it
            elif state == DfaState.Id_int1:  # seen 'i'
                if ch == 'n':
                    state = DfaState.Id_int2
                    self.token.token_text += ch
                elif self.is_digit(ch) or self.is_alpha(ch):
                    state = DfaState.Id  # just an identifier after all
                    self.token.token_text += ch
                else:
                    state = self.init_token(ch)
            elif state == DfaState.Id_int2:  # seen 'in'
                if ch == 't':
                    state = DfaState.Id_int3
                    self.token.token_text += ch
                elif self.is_digit(ch) or self.is_alpha(ch):
                    state = DfaState.Id  # just an identifier after all
                    self.token.token_text += ch
                else:
                    state = self.init_token(ch)
            elif state == DfaState.Id_int3:  # seen 'int'
                if self.is_alpha(ch) or self.is_digit(ch):
                    # The keyword continues into an identifier, e.g. 'inta'.
                    state = DfaState.Id
                    self.token.token_text += ch
                else:
                    # BUGFIX: the original recognized the keyword only when
                    # followed by a blank, so e.g. "int;" lexed as the single
                    # identifier "int;".  Any non-identifier character ends
                    # the keyword.
                    self.token.token_type = TokenType.Int
                    state = self.init_token(ch)
            ich += 1
        # Flush the last token.
        if len(self.token.token_text) > 0:
            # BUGFIX: if the code ends right after "int", the machine is
            # still in Id_int3 and the type would otherwise stay Identifier.
            if state == DfaState.Id_int3:
                self.token.token_type = TokenType.Int
            self.init_token(ch)
        return SimpleTokenReader(self.tokens)
test.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from simple_lexer import SimpleLexer
from simple_calculator import SimpleCalculator
from simple_parser import SimpleParser
def test_simple_lexer():
    """Exercise SimpleLexer.tokenize on a handful of sample scripts."""
    lexer = SimpleLexer()
    # Each entry exercises one aspect of the DFA: a normal declaration,
    # 'inta'/'in' (identifiers sharing a prefix with the 'int' keyword),
    # and the '>=' / '>' operators.
    scripts = [
        "int age = 45;",
        "inta age = 45;",
        "in age = 45;",
        "age >= 45;",
        "age > 45;",
    ]
    for index, script in enumerate(scripts):
        # The first header has no leading blank line; the rest do.
        prefix = "" if index == 0 else "\n"
        print("{}parse: {}".format(prefix, script))
        tokenReader = lexer.tokenize(script)
        lexer.dump(tokenReader)


if __name__ == '__main__':
    test_simple_lexer()
结果:
python test.py
parse: int age = 45;
text type
int TokenType.Int
age TokenType.Identifier
= TokenType.Assignment
45 TokenType.IntLiteral
; TokenType.SemiColon
parse: inta age = 45;
text type
inta TokenType.Identifier
age TokenType.Identifier
= TokenType.Assignment
45 TokenType.IntLiteral
; TokenType.SemiColon
parse: in age = 45;
text type
in TokenType.Identifier
age TokenType.Identifier
= TokenType.Assignment
45 TokenType.IntLiteral
; TokenType.SemiColon
parse: age >= 45;
text type
age TokenType.Identifier
>= TokenType.GE
45 TokenType.IntLiteral
; TokenType.SemiColon
parse: age > 45;
text type
age TokenType.Identifier
> TokenType.GT
45 TokenType.IntLiteral
; TokenType.SemiColon
c++
craft.h
#ifndef _Craft_H_INCLUDED_
#define _Craft_H_INCLUDED_
#include <ostream>
#include <queue>
#include <string>
#include <vector>
using namespace std;
/**
 * The kinds of tokens the lexer can produce.
 */
enum class TokenType
{
    Plus, // +
    Minus, // -
    Star, // *
    Slash, // /
    GE, // >=
    GT, // >
    EQ, // ==
    LE, // <=
    LT, // <
    SemiColon, // ;
    LeftParen, // (
    RightParen,// )
    Assignment,// =
    If,
    Else,
    Int,
    Identifier, // identifier
    IntLiteral, // integer literal
    StringLiteral // string literal
};
/**
 * Print a human-readable name for a TokenType.
 *
 * `inline` is required here: this function is *defined* in a header, so
 * without it every translation unit that includes craft.h would emit its
 * own definition and linking would fail with multiple-definition (ODR)
 * errors as soon as the header is used from more than one .cpp file.
 */
inline std::ostream& operator << (std::ostream& os, TokenType t)
{
    switch (t)
    {
    case TokenType::Plus: os << "Plus"; break; // +
    case TokenType::Minus: os << "Minus"; break; // -
    case TokenType::Star: os << "Star"; break; // *
    case TokenType::Slash: os << "Slash"; break; // /
    case TokenType::GE: os << "GE"; break; // >=
    case TokenType::GT: os << "GT"; break; // >
    case TokenType::EQ: os << "EQ"; break; // ==
    case TokenType::LE: os << "LE"; break; // <=
    case TokenType::LT: os << "LT"; break; // <
    case TokenType::SemiColon: os << "SemiColon"; break; // ;
    case TokenType::LeftParen: os << "LeftParen"; break; // (
    case TokenType::RightParen: os << "RightParen"; break;// )
    case TokenType::Assignment: os << "Assignment"; break;// =
    case TokenType::If: os << "If"; break;
    case TokenType::Else: os << "Else"; break;
    case TokenType::Int: os << "Int"; break;
    case TokenType::Identifier: os << "Identifier"; break; // identifier
    case TokenType::IntLiteral: os << "IntLiteral"; break; // integer literal
    case TokenType::StringLiteral: os << "StringLiteral"; break; // string literal
    default: os.setstate(std::ios_base::failbit); break; // unknown enumerator
    }
    return os;
}
/**
 * A minimal token: a type plus the text it was lexed from.
 */
class Token
{
public:
    virtual ~Token() {}
    virtual TokenType getType() = 0; // kind of this token
    virtual string getText() = 0;    // raw text of this token
};
/**
 * The kinds of AST nodes.
 */
enum class ASTNodeType
{
    Programm, // program entry, the root node
    IntDeclaration, // integer variable declaration
    ExpressionStmt, // expression statement: an expression followed by ';'
    AssignmentStmt, // assignment statement
    Primary, // primary expression
    Multiplicative, // multiplicative expression
    Additive, // additive expression
    Identifier, // identifier
    IntLiteral // integer literal
};
/**
 * An AST node: type, text value, children and parent.
 */
class ASTNode
{
public:
    // A polymorphic base needs a virtual destructor: deleting a concrete
    // node through an ASTNode* is undefined behavior without one.  The
    // sibling Token interface already declares one; this makes the two
    // consistent.
    virtual ~ASTNode() {}
    virtual ASTNode* getParent() = 0; // parent node
    virtual vector<ASTNode*> getChildren() = 0; // child nodes
    virtual ASTNodeType getType() = 0; // node type
    virtual string getText() = 0; // text value
};
/**
 * A stream of tokens.  Produced by the lexer; the parser reads from it.
 */
class TokenReader
{
public:
    // Virtual destructor: concrete readers are handled (and deleted)
    // through TokenReader* / base-class pointers; without this, that
    // delete is undefined behavior.
    virtual ~TokenReader() {}
    /**
     * Return the next token and remove it from the stream.
     * Returns null if the stream is empty.
     */
    virtual Token* read() = 0;
    /**
     * Return the next token without removing it from the stream.
     * Returns null if the stream is empty.
     */
    virtual Token* peek() = 0;
    /**
     * Step the stream back one token, restoring the previous one.
     */
    virtual void unread() = 0;
    /**
     * Return the current read position in the stream.
     */
    virtual int getPosition() = 0;
    /**
     * Set the current read position in the stream.
     */
    virtual void setPosition(int position) = 0;
};
#endif /* _Craft_H_INCLUDED_ */
SimpleLexer.h
#ifndef _SimpleLexer_H_INCLUDED_
#define _SimpleLexer_H_INCLUDED_
#include <iostream>
#include "craft.h"
/**
 * A straightforward Token implementation: just a type and a text value.
 */
class SimpleToken: public Token
{
public:
    // Kind of this token.
    // NOTE(review): left uninitialized; the lexer assigns it before any token
    // is pushed, but calling getType() on a fresh token reads an
    // indeterminate value — consider giving it a default. TODO confirm.
    TokenType type;
    // Raw text of this token.
    string text;
public:
    TokenType getType()
    {
        return type;
    }
    string getText()
    {
        return text;
    }
};
/**
* 一个简单的Token流。是把一个Token列表进行了封装。
*/
class SimpleTokenReader: public TokenReader
{
public:
vector<Token*> tokens;
int pos = 0;
SimpleTokenReader(vector<Token*> &tokens1)
{
tokens = tokens1;
}
Token* read()
{
if (pos < tokens.size())
{
return tokens[pos++];
}
return NULL;
}
Token* peek()
{
if (pos < tokens.size())
{
return tokens[pos];
}
return NULL;
}
void unread()
{
if (pos > 0)
{
pos--;
}
}
int getPosition()
{
return pos;
}
void setPosition(int position)
{
if (position >=0 && position < tokens.size())
{
pos = position;
}
}
};
/**
 * States of the lexer's finite state machine.
 */
enum class DfaState
{
    Initial,

    // Keyword recognition ('if', 'else', 'int') plus the identifier
    // states reachable from their prefixes.
    If,
    Id_if1,
    Id_if2,
    Else,
    Id_else1,
    Id_else2,
    Id_else3,
    Id_else4,
    Int,
    Id_int1,
    Id_int2,
    Id_int3,
    Id,

    // Operators and punctuation.
    GT,
    GE,
    Assignment,
    Plus,
    Minus,
    Star,
    Slash,
    SemiColon,
    LeftParen,
    RightParen,

    // Literals.
    IntLiteral
};
/**
 * A simple hand-written lexer producing tokens for the calculator / toy
 * script language used later in the course.
 *
 * NOTE(review): ownership of the Token* objects is murky — tokenize() hands
 * the same pointers to a SimpleTokenReader while also keeping them in
 * `tokens`, and the destructor frees only the tokens of the *last*
 * tokenize() run; tokens from earlier runs leak.  Confirm the intended
 * ownership model before tightening.
 */
class SimpleLexer
{
public:
    vector<char> tokenText; // text of the token currently being recognized
    vector<Token*> tokens; // tokens recognized by the last tokenize() run
    SimpleToken *token = new SimpleToken; // the token currently being recognized
    ~SimpleLexer()
    {
        tokenText.clear();
        // Free the tokens of the last run (earlier runs were already lost).
        for(int i=0; i<tokens.size(); i++)
        {
            delete tokens[i];
        }
        tokens.clear();
        delete token;
    }
    // Copy the accumulated character buffer into a string.
    string vectorToString(vector<char> &v)
    {
        string result;
        result.insert(result.begin(), v.begin(), v.end());
        return result;
    }
    // Is ch an ASCII letter?
    bool isAlpha(int ch)
    {
        return ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'));
    }
    // Is ch an ASCII digit?
    bool isDigit(int ch)
    {
        return (ch >= '0' && ch <= '9');
    }
    // Is ch a blank character (space, tab or newline)?
    bool isBlank(int ch)
    {
        return ch == ' ' || ch == '\t' || ch == '\n';
    }
    // Print every token in the reader, one per line, consuming the stream.
    static void dump(SimpleTokenReader *tokenReader)
    {
        cout << "text\t\ttype" << endl;
        Token *token = NULL;
        while (token = tokenReader->read())
        {
            cout << token->getText() << "\t\t" << token->getType() << endl;
        }
    }
    // Enter the initial DFA state; defined in SimpleLexer.cpp.
    DfaState initToken(char ch);
    // Split `code` into tokens; returns a reader over them. Defined in
    // SimpleLexer.cpp.
    SimpleTokenReader* tokenize(string code);
};
#endif /* _SimpleLexer_H_INCLUDED_ */
SimpleLexer.cpp
#include "SimpleLexer.h"
int main()
{
SimpleLexer *lexer = new SimpleLexer;
string script = "int age = 45;";
cout << "parse: " << script << endl;
SimpleTokenReader *tokenReader = lexer->tokenize(script);
lexer->dump(tokenReader);
//测试inta的解析
script = "inta age = 45;";
cout << "\nparse: " << script << endl;
tokenReader = lexer->tokenize(script);
lexer->dump(tokenReader);
//测试in的解析
script = "in age = 45;";
cout << "\nparse: " << script << endl;
tokenReader = lexer->tokenize(script);
lexer->dump(tokenReader);
//测试>=的解析
script = "age >= 45;";
cout << "\nparse: " << script << endl;
tokenReader = lexer->tokenize(script);
lexer->dump(tokenReader);
//测试>的解析
script = "age > 45;";
cout << "\nparse: " << script << endl;
tokenReader = lexer->tokenize(script);
lexer->dump(tokenReader);
delete lexer;
delete tokenReader;
return 0;
}
/**
 * Enter the initial state of the finite state machine.
 *
 * The machine never rests in this state: the character immediately drives
 * it somewhere else.  Called both at the start of lexing and whenever a
 * token has just been completed — in the latter case the finished token is
 * saved and a fresh one is started.
 * @param ch the current character
 * @return the state chosen for ch
 */
DfaState SimpleLexer::initToken(char ch)
{
    // A non-empty buffer means the previous token just ended: save it and
    // start a fresh one.
    if (!tokenText.empty())
    {
        token->text = vectorToString(tokenText);
        tokens.push_back(token);
        tokenText.clear();
        token = new SimpleToken();
    }

    DfaState newState;

    if (isAlpha(ch)) // a letter starts an identifier (or the "int" keyword)
    {
        newState = (ch == 'i') ? DfaState::Id_int1 : DfaState::Id;
        token->type = TokenType::Identifier;
    }
    else if (isDigit(ch)) // a digit starts an integer literal
    {
        newState = DfaState::IntLiteral;
        token->type = TokenType::IntLiteral;
    }
    else // operators and punctuation: one state per character
    {
        switch (ch)
        {
        case '>': newState = DfaState::GT;         token->type = TokenType::GT;         break;
        case '+': newState = DfaState::Plus;       token->type = TokenType::Plus;       break;
        case '-': newState = DfaState::Minus;      token->type = TokenType::Minus;      break;
        case '*': newState = DfaState::Star;       token->type = TokenType::Star;       break;
        case '/': newState = DfaState::Slash;      token->type = TokenType::Slash;      break;
        case ';': newState = DfaState::SemiColon;  token->type = TokenType::SemiColon;  break;
        case '(': newState = DfaState::LeftParen;  token->type = TokenType::LeftParen;  break;
        case ')': newState = DfaState::RightParen; token->type = TokenType::RightParen; break;
        case '=': newState = DfaState::Assignment; token->type = TokenType::Assignment; break;
        default:  return DfaState::Initial; // skip all unknown patterns
        }
    }

    tokenText.push_back(ch);
    return newState;
}
/**
 * Split `code` into tokens, driving the finite state machine character by
 * character.
 * @param code the source text
 * @return a reader over the recognized tokens; the caller owns the reader,
 *         while the Token objects remain owned by this lexer
 */
SimpleTokenReader* SimpleLexer::tokenize(string code)
{
    // NOTE(review): tokens from a previous run are cleared but not freed
    // here, because a previously returned SimpleTokenReader may still hold
    // the same pointers; they leak unless the destructor still sees them.
    // TODO confirm the intended ownership model.
    tokens.clear();
    tokenText.clear();
    // BUGFIX: the original overwrote `token` without freeing it, leaking
    // the previous partially-built token on every call.
    delete token;
    token = new SimpleToken;
    int ich = 0;
    char ch = 0;
    DfaState state = DfaState::Initial;
    while (ich < (int)code.size()) // cast silences signed/unsigned warning
    {
        ch = (char) code[ich];
        switch (state)
        {
        case DfaState::Initial:
            state = initToken(ch); // choose the next state
            break;
        case DfaState::Id:
            if (isAlpha(ch) || isDigit(ch))
            {
                tokenText.push_back(ch); // stay in the identifier
            }
            else
            {
                state = initToken(ch); // identifier done; save it
            }
            break;
        case DfaState::GT:
            if (ch == '=')
            {
                token->type = TokenType::GE; // upgrade to GE
                state = DfaState::GE;
                tokenText.push_back(ch);
            }
            else
            {
                state = initToken(ch); // plain GT; save it
            }
            break;
        // Single-token states: any character ends the token.
        case DfaState::GE:
        case DfaState::Assignment:
        case DfaState::Plus:
        case DfaState::Minus:
        case DfaState::Star:
        case DfaState::Slash:
        case DfaState::SemiColon:
        case DfaState::LeftParen:
        case DfaState::RightParen:
            state = initToken(ch);
            break;
        case DfaState::IntLiteral:
            if (isDigit(ch))
            {
                tokenText.push_back(ch); // stay in the literal
            }
            else
            {
                state = initToken(ch); // literal done; save it
            }
            break;
        case DfaState::Id_int1: // seen 'i'
            if (ch == 'n')
            {
                state = DfaState::Id_int2;
                tokenText.push_back(ch);
            }
            else if (isDigit(ch) || isAlpha(ch))
            {
                state = DfaState::Id; // just an identifier after all
                tokenText.push_back(ch);
            }
            else
            {
                state = initToken(ch);
            }
            break;
        case DfaState::Id_int2: // seen 'in'
            if (ch == 't')
            {
                state = DfaState::Id_int3;
                tokenText.push_back(ch);
            }
            else if (isDigit(ch) || isAlpha(ch))
            {
                state = DfaState::Id; // just an identifier after all
                tokenText.push_back(ch);
            }
            else
            {
                state = initToken(ch);
            }
            break;
        case DfaState::Id_int3: // seen 'int'
            if (isAlpha(ch) || isDigit(ch))
            {
                state = DfaState::Id; // keyword continues into an identifier, e.g. "inta"
                tokenText.push_back(ch);
            }
            else
            {
                // BUGFIX: the original recognized the keyword only before a
                // blank, so e.g. "int;" lexed as the single identifier
                // "int;".  Any non-identifier character ends the keyword.
                token->type = TokenType::Int;
                state = initToken(ch);
            }
            break;
        default:
            break;
        }
        ich++;
    }
    // Flush the last token.
    if (tokenText.size() > 0)
    {
        // BUGFIX: if the code ends right after "int", the machine is still
        // in Id_int3 and the type would otherwise remain Identifier.
        if (state == DfaState::Id_int3)
        {
            token->type = TokenType::Int;
        }
        initToken(ch);
    }
    return new SimpleTokenReader(tokens);
}
结果
$ g++ -std=c++11 SimpleLexer.cpp
$ ./a.out
parse: int age = 45;
text type
int Int
age Identifier
= Assignment
45 IntLiteral
; SemiColon
parse: inta age = 45;
text type
inta Identifier
age Identifier
= Assignment
45 IntLiteral
; SemiColon
parse: in age = 45;
text type
in Identifier
age Identifier
= Assignment
45 IntLiteral
; SemiColon
parse: age >= 45;
text type
age Identifier
>= GE
45 IntLiteral
; SemiColon
parse: age > 45;
text type
age Identifier
> GT
45 IntLiteral
; SemiColon
github: https://github.com/buyouran1/PlayWithCompiler
课程:https://time.geekbang.org/column/article/118378