Lan的源代码由一些基本元素构成,我们称之为Token,在词法分析阶段我们需要将输入的字符流转化成Token流(简单说就是Token列表)。
下面是Token的类型定义,为了节省资源采用整数表示而不用枚举类型。
/**
 * Integer codes identifying each kind of token.
 *
 * <p>Plain {@code int} constants are used instead of an enum; the trailing
 * comment on each constant shows the source text (or category) it stands for.
 */
public class TokenType {
    // ---- arithmetic operators ----
    public static final int PLUS = 0;          // "+"
    public static final int PLUSPLUS = 1;      // "++"
    public static final int MINUS = 2;         // "-"
    public static final int MINUSMINUS = 3;    // "--"
    public static final int ASTERISK = 4;      // "*"
    public static final int SLASH = 5;         // "/"
    public static final int PERCENT = 6;       // "%"

    // ---- comparison operators ----
    public static final int EQUAL = 7;         // "=="
    public static final int NOT_EQUAL = 8;     // "!="
    public static final int GT = 9;            // ">"
    public static final int GE = 10;           // ">="
    public static final int LT = 11;           // "<"
    public static final int LE = 12;           // "<="

    // ---- logical operators ----
    public static final int AND = 13;          // "&&"
    public static final int OR = 14;           // "||"
    public static final int BANG = 15;         // "!"

    // ---- punctuation ----
    public static final int LEFT_PAREN = 16;   // "("
    public static final int RIGHT_PAREN = 17;  // ")"
    public static final int LEFT_BRACE = 18;   // "{"
    public static final int RIGHT_BRACE = 19;  // "}"
    public static final int COMMA = 20;        // ","
    public static final int QUESTION = 21;     // "?"
    public static final int COLON = 22;        // ":"

    // ---- literals, assignment and names ----
    public static final int NUMBER = 23;       // numeric literal
    public static final int STRING = 24;       // string literal
    public static final int ASSIGN = 25;       // "="
    public static final int TRUE = 26;         // "true"
    public static final int FALSE = 27;        // "false"
    public static final int NULL = 28;         // "null"
    public static final int IDENTIFIER = 29;   // variable name

    // ---- keywords ----
    public static final int IF = 30;           // "if"
    public static final int ELSE = 31;         // "else"
    public static final int WHILE = 32;        // "while"
    public static final int BREAK = 33;        // "break"
    public static final int CONTINUE = 34;     // "continue"
    public static final int PRINT = 35;        // "print"
    public static final int FUNC = 36;         // "func"
    public static final int RETURN = 37;       // "return"

    // ---- end of input ----
    public static final int EOF = 38;          // end of the token stream
}
每种类型代表的内容看后面的注释即可,没有值得解释的内容。然后定义Token的结构。
/**
 * One lexical unit: a type code, its text, and the source line it was found on.
 */
public class Token {
    /** Token type code (the integer constants are declared in TokenType). */
    public int type;

    /** Token text, i.e. the content shown in the comments of the TokenType constants. */
    public String symbol;

    /** Line number in the source code where the token appears. */
    public int line;

    /**
     * Creates a token; the three arguments are stored as-is in the public fields.
     *
     * @param type   token type code
     * @param symbol token text
     * @param line   source line number of the token
     */
    public Token(int type, String symbol, int line) {
        this.line = line;
        this.symbol = symbol;
        this.type = type;
    }
}
接下来是词法分析器,我们称之为Lexer。代码中的注释已经解释得很清楚了,没有什么难度。
/**
 * Lexer: turns a source string into a list of {@link Token}s.
 *
 * <p>Whitespace and {@code //} line comments are dropped; everything else becomes
 * a token tagged with a {@code TokenType} code and the line it was found on.
 */
public class Lexer {
    // Keyword dictionary: every alphabetic word taken from the source is looked up
    // here to decide whether it is a reserved word or a plain identifier.
    private final Map<String, Integer> keywordsFilter;

    /** Creates a lexer and fills the keyword dictionary. */
    public Lexer() {
        keywordsFilter = new HashMap<>();
        keywordsFilter.put("true", TokenType.TRUE);
        keywordsFilter.put("false", TokenType.FALSE);
        keywordsFilter.put("null", TokenType.NULL);
        keywordsFilter.put("if", TokenType.IF);
        keywordsFilter.put("else", TokenType.ELSE);
        keywordsFilter.put("while", TokenType.WHILE);
        keywordsFilter.put("break", TokenType.BREAK);
        keywordsFilter.put("continue", TokenType.CONTINUE);
        keywordsFilter.put("print", TokenType.PRINT);
        keywordsFilter.put("func", TokenType.FUNC);
        keywordsFilter.put("return", TokenType.RETURN);
    }

    /**
     * Splits {@code code} into tokens.
     *
     * @param code the source text to tokenize
     * @return the tokens in source order, always terminated by an EOF token
     *         whose symbol is the empty string
     * @throws RuntimeException if a single {@code |} or {@code &} is found, a string
     *         literal is not closed before the end of its line or of the input, or an
     *         unrecognized character is encountered
     */
    public List<Token> lex(String code) {
        // Collects every token to be returned.
        List<Token> tokens = new ArrayList<>();
        // Index of the next character to read from the source.
        int index = 0;
        // Line number attached to the tokens being produced.
        int currentLine = 1;
        // Total number of characters in the source.
        int codeLength = code.length();
        while (index < codeLength) {
            // Take the next character and advance the index past it.
            char c = code.charAt(index++);
            // Spaces, carriage returns and tabs carry no meaning: skip them.
            if (c == ' ' || c == '\r' || c == '\t') {
                continue;
            }
            // A line feed only advances the line counter.
            if (c == '\n') {
                currentLine++;
                continue;
            }
            if (c == '+') {
                if (index < codeLength && code.charAt(index) == '+') {
                    index++;
                    tokens.add(new Token(TokenType.PLUSPLUS, "++", currentLine));
                } else {
                    tokens.add(new Token(TokenType.PLUS, "+", currentLine));
                }
            } else if (c == '-') {
                if (index < codeLength && code.charAt(index) == '-') {
                    index++;
                    tokens.add(new Token(TokenType.MINUSMINUS, "--", currentLine));
                } else {
                    tokens.add(new Token(TokenType.MINUS, "-", currentLine));
                }
            } else if (c == '*') {
                tokens.add(new Token(TokenType.ASTERISK, "*", currentLine));
            } else if (c == '/') {
                if (index < codeLength && code.charAt(index) == '/') {
                    // Line comment: skip up to (but not including) the line feed, so the
                    // main loop still sees the '\n' and bumps the line counter.
                    do {
                        index++;
                    } while (index < codeLength && code.charAt(index) != '\n');
                } else {
                    tokens.add(new Token(TokenType.SLASH, "/", currentLine));
                }
            } else if (c == '%') {
                tokens.add(new Token(TokenType.PERCENT, "%", currentLine));
            } else if (c == '(') {
                tokens.add(new Token(TokenType.LEFT_PAREN, "(", currentLine));
            } else if (c == ')') {
                tokens.add(new Token(TokenType.RIGHT_PAREN, ")", currentLine));
            } else if (c == '{') {
                tokens.add(new Token(TokenType.LEFT_BRACE, "{", currentLine));
            } else if (c == '}') {
                tokens.add(new Token(TokenType.RIGHT_BRACE, "}", currentLine));
            } else if (c == ',') {
                tokens.add(new Token(TokenType.COMMA, ",", currentLine));
            } else if (c == '?') {
                tokens.add(new Token(TokenType.QUESTION, "?", currentLine));
            } else if (c == ':') {
                tokens.add(new Token(TokenType.COLON, ":", currentLine));
            } else if (c == '>') {
                if (index < codeLength && code.charAt(index) == '=') {
                    index++;
                    tokens.add(new Token(TokenType.GE, ">=", currentLine));
                } else {
                    tokens.add(new Token(TokenType.GT, ">", currentLine));
                }
            } else if (c == '<') {
                if (index < codeLength && code.charAt(index) == '=') {
                    index++;
                    tokens.add(new Token(TokenType.LE, "<=", currentLine));
                } else {
                    tokens.add(new Token(TokenType.LT, "<", currentLine));
                }
            } else if (c == '!') {
                if (index < codeLength && code.charAt(index) == '=') {
                    index++;
                    tokens.add(new Token(TokenType.NOT_EQUAL, "!=", currentLine));
                } else {
                    tokens.add(new Token(TokenType.BANG, "!", currentLine));
                }
            } else if (c == '|') {
                // Only "||" exists; a single '|' is an error.
                if (index < codeLength && code.charAt(index) == '|') {
                    index++;
                    tokens.add(new Token(TokenType.OR, "||", currentLine));
                } else {
                    throw new RuntimeException("Lexer Error: expect '|'");
                }
            } else if (c == '&') {
                // Only "&&" exists; a single '&' is an error.
                if (index < codeLength && code.charAt(index) == '&') {
                    index++;
                    tokens.add(new Token(TokenType.AND, "&&", currentLine));
                } else {
                    throw new RuntimeException("Lexer Error: expect '&'");
                }
            } else if (c == '=') {
                if (index < codeLength && code.charAt(index) == '=') {
                    index++;
                    tokens.add(new Token(TokenType.EQUAL, "==", currentLine));
                } else {
                    tokens.add(new Token(TokenType.ASSIGN, "=", currentLine));
                }
            } else if (Character.isDigit(c)) {
                // Number: a run of consecutive digits starting at the character just read.
                int start = index - 1;
                while (index < codeLength && Character.isDigit(code.charAt(index))) {
                    index++;
                }
                tokens.add(new Token(TokenType.NUMBER, code.substring(start, index), currentLine));
            } else if (Character.isAlphabetic(c)) {
                // Word: a run of alphabetic characters; it is a keyword if the dictionary
                // knows it, otherwise an identifier.
                int start = index - 1;
                while (index < codeLength && Character.isAlphabetic(code.charAt(index))) {
                    index++;
                }
                String word = code.substring(start, index);
                Integer type = keywordsFilter.get(word);
                tokens.add(new Token(type == null ? TokenType.IDENTIFIER : type, word, currentLine));
            } else if (c == '"') {
                // String literal: everything up to the closing quote on the same line.
                int start = index;
                int end = start;
                while (end < codeLength && code.charAt(end) != '"' && code.charAt(end) != '\n') {
                    end++;
                }
                // Check the closing quote explicitly. Judging by the last character read
                // would mistake the opening quote for the closing one when the input ends
                // right after it, and the substring call below would then fail.
                if (end >= codeLength || code.charAt(end) != '"') {
                    throw new RuntimeException("Lexer Error: expect \"");
                }
                tokens.add(new Token(TokenType.STRING, code.substring(start, end), currentLine));
                // Continue right after the closing quote.
                index = end + 1;
            } else {
                throw new RuntimeException(String.format("Lexer Error: unknown character \"%c\"", c));
            }
        }
        // Mark the end of the token stream.
        tokens.add(new Token(TokenType.EOF, "", currentLine));
        return tokens;
    }
}
最后手动测试一下:程序逐行读取输入,打印出每个Token的内容,输入 .q 即可退出。
/**
 * Interactive driver for manual testing: reads one line at a time, lexes it,
 * and prints the symbol of every resulting token.
 */
public class Main {
    /**
     * Runs the prompt loop until the user enters {@code .q} or the input ends.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try (Scanner scanner = new Scanner(System.in)) {
            Lexer lexer = new Lexer();
            while (true) {
                System.out.print(">>> ");
                // Stop cleanly at end of input (Ctrl-D, piped file) instead of letting
                // nextLine() throw NoSuchElementException.
                if (!scanner.hasNextLine()) {
                    break;
                }
                String code = scanner.nextLine();
                if (code.equals(".q")) {
                    break;
                }
                try {
                    List<Token> tokens = lexer.lex(code);
                    for (Token token : tokens) {
                        System.out.println(token.symbol);
                    }
                } catch (RuntimeException e) {
                    // The lexer reports bad input with a RuntimeException; show the
                    // message and keep the session alive for the next line.
                    System.out.println(e.getMessage());
                }
            }
        }
    }
}