Javac 词法分析
词法分析主要将 Java 源代码中的所有字符转变为 Token
Token类型
com.sun.tools.javac.parser.Tokens.TokenKind
EOF(),
ERROR(),
IDENTIFIER(Tag.NAMED), // 这个 Token 对象没有 name 值,用来泛指用户自定义的类名、包名、变量名、方法名等
ABSTRACT("abstract"),
ASSERT("assert", Tag.NAMED),
BOOLEAN("boolean", Tag.NAMED),
// ...
解析token的步骤
1.读取Java源文件到字符数组中.
2.生成token
demo
package com.sun.tools.javac.zc;
/**
* Sample input used to step through the javac lexer. It deliberately mixes
* identifier forms, integer literals in several radixes, a string literal and
* all three comment styles so that each one can be observed being tokenized.
*
* @author zhangcheng
* @date 2020/8/23
*/
public class HelloWorld {
/** javadoc comments */
public static void main(String[] args) {
String Abc; // identifier starting with an upper-case letter
String $a; // identifier starting with '$'
int i = 0x12; // hexadecimal literal
int i2 = 0b1; // binary literal
int i3 = 1; // decimal literal
long num4 = 1L; // literal with the 'L' suffix
// one line Comments
/* more line comments */
System.out.println("hello"); // string literal
}
}
调用栈
scanChar:109, UnicodeReader (com.sun.tools.javac.parser)
readToken:473, JavaTokenizer (com.sun.tools.javac.parser)
nextToken:115, Scanner (com.sun.tools.javac.parser)
nextToken:297, JavacParser (com.sun.tools.javac.parser)
accept:485, JavacParser (com.sun.tools.javac.parser)
parseCompilationUnit:3051, JavacParser (com.sun.tools.javac.parser)
parse:636, JavaCompiler (com.sun.tools.javac.main)
parse:673, JavaCompiler (com.sun.tools.javac.main)
parseFiles:958, JavaCompiler (com.sun.tools.javac.main)
compile:865, JavaCompiler (com.sun.tools.javac.main)
compile:517, Main (com.sun.tools.javac.main)
compile:376, Main (com.sun.tools.javac.main)
compile:365, Main (com.sun.tools.javac.main)
compile:356, Main (com.sun.tools.javac.main)
compile:94, Main (com.sun.tools.javac)
run:235, JavacTool (com.sun.tools.javac.api)
main:10, T01Compiler (com.sun.tools.javac.zc)
Scanner(Lexer 的实现类)的 nextToken() 会调用 tokenizer.readToken() 来读取下一个 Token,调用关系如下:
Scanner -> tokenizer -> UnicodeReader
/**
* Reads the next token from the reader.
*
* <p>Resets the per-token state ({@code reader.sp}, {@code name}, {@code radix}),
* then loops over the current character. White space, line terminators and
* complete comments only {@code break} out of the switch, so the loop runs
* again; every branch that recognises a token (or reports a lexical error)
* leaves via {@code break loop}. After the loop a token object is created
* according to {@code tk.tag}, carrying the comments collected on the way.
*
* @return the token that was read; its concrete class depends on {@code tk.tag}
* @throws AssertionError if {@code tk.tag} is none of the handled tags
*/
public Token readToken() {
reader.sp = 0;
name = null;
radix = 0;
int pos = 0;
int endPos = 0; // end index in buf of the token being read
List<Comment> comments = null;
try {
loop: while (true) {
pos = reader.bp; // start of the candidate token; per the original note bp (and so pos) is 0 on the first call
switch (reader.ch) {
case ' ': // (Spec 3.6) space
case '\t': // (Spec 3.6) horizontal tab
case FF: // (Spec 3.6) form feed
// consume the whole run of white space in one go
do {
reader.scanChar();
} while (reader.ch == ' ' || reader.ch == '\t' || reader.ch == FF);
processWhiteSpace(pos, reader.bp);
break;
case LF: // (Spec 3.4) line feed (\n)
reader.scanChar();
processLineTerminator(pos, reader.bp); // per the original note this only writes a debug log
break;
case CR: // (Spec 3.4) carriage return
reader.scanChar();
if (reader.ch == LF) { // CR LF counts as a single line terminator
reader.scanChar();
}
processLineTerminator(pos, reader.bp);
break;
// Identifiers: an ASCII identifier starts with a letter, '_' or '$'.
case 'A': case 'B': case 'C': case 'D': case 'E':
case 'F': case 'G': case 'H': case 'I': case 'J':
case 'K': case 'L': case 'M': case 'N': case 'O':
case 'P': case 'Q': case 'R': case 'S': case 'T':
case 'U': case 'V': case 'W': case 'X': case 'Y':
case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e':
case 'f': case 'g': case 'h': case 'i': case 'j':
case 'k': case 'l': case 'm': case 'n': case 'o':
case 'p': case 'q': case 'r': case 's': case 't':
case 'u': case 'v': case 'w': case 'x': case 'y':
case 'z':
case '$': case '_':
scanIdent(); // scan the identifier / keyword
break loop;
case '0': // Numbers starting with '0'
reader.scanChar();
if (reader.ch == 'x' || reader.ch == 'X') { // hexadecimal integer or floating-point literal
reader.scanChar();
skipIllegalUnderscores();
if (reader.ch == '.') {
// '.' directly after the 0x prefix: scan the hex fraction and suffix
scanHexFractionAndSuffix(pos, false);
} else if (reader.digit(pos, 16) < 0) {
lexError(pos, "invalid.hex.number");
} else { // a hex digit follows: scan the number with radix 16
scanNumber(pos, 16);
}
} else if (reader.ch == 'b' || reader.ch == 'B') { // binary integer literal
if (!allowBinaryLiterals) {
lexError(pos, "unsupported.binary.lit", source.name);
allowBinaryLiterals = true; // flipped after reporting, so this check does not fire again
}
reader.scanChar();
skipIllegalUnderscores();
if (reader.digit(pos, 2) < 0) {
lexError(pos, "invalid.binary.number");
} else {
scanNumber(pos, 2);
}
} else { // no radix prefix: scanned with radix 8 (octal integer, per the original note)
reader.putChar('0');
if (reader.ch == '_') {
int savePos = reader.bp;
// skip the run of underscores; a decimal digit must follow it
do {
reader.scanChar();
} while (reader.ch == '_');
if (reader.digit(pos, 10) < 0) {
lexError(savePos, "illegal.underscore");
}
}
scanNumber(pos, 8);
}
break loop;
case '1': case '2': case '3': case '4': // decimal integer or floating-point literal
case '5': case '6': case '7': case '8': case '9':
scanNumber(pos, 10);
break loop;
case '.':
reader.scanChar();
if ('0' <= reader.ch && reader.ch <= '9') { // a digit follows: fractional part of a decimal literal
reader.putChar('.');
scanFractionAndSuffix(pos); // scan the decimal fraction and its suffix
} else if (reader.ch == '.') { // second dot: candidate "..." (varargs ellipsis)
int savePos = reader.bp;
reader.putChar('.'); reader.putChar('.', true);
if (reader.ch == '.') {
reader.scanChar();
reader.putChar('.');
tk = TokenKind.ELLIPSIS;
} else {
// exactly two dots is not a token
lexError(savePos, "illegal.dot");
}
} else { // a lone '.' is the DOT separator
tk = TokenKind.DOT;
}
break loop;
case ',': // Separators
reader.scanChar(); tk = TokenKind.COMMA; break loop;
case ';':
reader.scanChar(); tk = TokenKind.SEMI; break loop;
case '(':
reader.scanChar(); tk = TokenKind.LPAREN; break loop;
case ')':
reader.scanChar(); tk = TokenKind.RPAREN; break loop;
case '[':
reader.scanChar(); tk = TokenKind.LBRACKET; break loop;
case ']':
reader.scanChar(); tk = TokenKind.RBRACKET; break loop;
case '{':
reader.scanChar(); tk = TokenKind.LBRACE; break loop;
case '}':
reader.scanChar(); tk = TokenKind.RBRACE; break loop;
case '/':
reader.scanChar();
if (reader.ch == '/') { // single-line comment
// consume up to (not including) the line terminator or the end of the buffer
do {
reader.scanCommentChar();
} while (reader.ch != CR && reader.ch != LF && reader.bp < reader.buflen);
if (reader.bp < reader.buflen) {
comments = addComment(comments, processComment(pos, reader.bp, CommentStyle.LINE));
}
break; // a comment is not a token: go around the loop again
} else if (reader.ch == '*') { // block comment or documentation comment
boolean isEmpty = false;
reader.scanChar();
CommentStyle style;
if (reader.ch == '*') {
style = CommentStyle.JAVADOC;
reader.scanCommentChar();
if (reader.ch == '/') {
// "/**/" : the comment is already closed, skip the body scan below
isEmpty = true;
}
} else {
style = CommentStyle.BLOCK;
}
// scan the body until '*' followed by '/' or the end of the buffer
while (!isEmpty && reader.bp < reader.buflen) {
if (reader.ch == '*') {
reader.scanChar();
if (reader.ch == '/') break;
} else {
reader.scanCommentChar();
}
}
if (reader.ch == '/') {
reader.scanChar();
comments = addComment(comments, processComment(pos, reader.bp, style));
break; // comment closed: go around the loop again
} else {
lexError(pos, "unclosed.comment");
break loop;
}
} else if (reader.ch == '=') { // compound assignment operator "/="
tk = TokenKind.SLASHEQ;
reader.scanChar();
} else { // division operator "/"
tk = TokenKind.SLASH;
}
break loop;
case '\'': // Character literal (leading single quote)
reader.scanChar();
if (reader.ch == '\'') {
lexError(pos, "empty.char.lit");
} else {
if (reader.ch == CR || reader.ch == LF)
lexError(pos, "illegal.line.end.in.char.lit");
scanLitChar(pos); // scan the content of the character literal
char ch2 = reader.ch; // NOTE(review): ch2 is never read in this method
if (reader.ch == '\'') {
reader.scanChar();
tk = TokenKind.CHARLITERAL;
} else {
lexError(pos, "unclosed.char.lit");
}
}
break loop;
case '\"': // String literal
reader.scanChar();
// scan until the closing quote, a line terminator or the end of the buffer
while (reader.ch != '\"' && reader.ch != CR && reader.ch != LF && reader.bp < reader.buflen)
scanLitChar(pos);
if (reader.ch == '\"') {
tk = TokenKind.STRINGLITERAL;
reader.scanChar();
} else {
lexError(pos, "unclosed.str.lit");
}
break loop;
default:
if (isSpecial(reader.ch)) {
scanOperator();
} else {
boolean isJavaIdentifierStart;
if (reader.ch < '\u0080') {
// all ASCII range chars already handled, above
isJavaIdentifierStart = false;
} else {
// non-ASCII: may be a surrogate pair, so test the full code point when there is one
char high = reader.scanSurrogates();
if (high != 0) {
reader.putChar(high);
isJavaIdentifierStart = Character.isJavaIdentifierStart(
Character.toCodePoint(high, reader.ch));
} else {
isJavaIdentifierStart = Character.isJavaIdentifierStart(reader.ch);
}
}
if (isJavaIdentifierStart) {
scanIdent();
} else if (reader.bp == reader.buflen || reader.ch == EOI && reader.bp + 1 == reader.buflen) { // JLS 3.5
// end of input: the EOF token is positioned at buflen
tk = TokenKind.EOF;
pos = reader.buflen;
} else {
// report the offending char: printable ASCII as-is, anything else as a unicode escape
String arg = (32 < reader.ch && reader.ch < 127) ?
String.format("%s", reader.ch) :
String.format("\\u%04x", (int)reader.ch);
lexError(pos, "illegal.char", arg);
reader.scanChar();
}
}
break loop;
}
}
endPos = reader.bp;
switch (tk.tag) { // create the token class that matches the kind's tag
case DEFAULT: return new Token(tk, pos, endPos, comments);
case NAMED: return new NamedToken(tk, pos, endPos, name, comments);
case STRING: return new StringToken(tk, pos, endPos, reader.chars(), comments);
case NUMERIC: return new NumericToken(tk, pos, endPos, reader.chars(), radix, comments);
default: throw new AssertionError();
}
}
finally {
// when scanner debugging is on, print the raw characters between pos and endPos
if (scannerDebug) {
System.out.println("nextToken(" + pos
+ "," + endPos + ")=|" +
new String(reader.getRawCharacters(pos, endPos))
+ "|");
}
}
}
UnicodeReader成员变量
/** The input buffer that characters are read from. */
protected char[] buf;
protected int bp; // index in buf of the character currently being read; starts at -1 according to the original note
protected final int buflen; // length of buf, i.e. the number of characters of the Java source file
/**
The current character, i.e. the one most recently read from buf.
*/
protected char ch;
/** A character buffer for saved chars: temporary storage for the characters
read from buf while a token is being scanned.
*/
protected char[] sbuf = new char[128];
protected int sp; // current index into sbuf
buf[]数组: 字符数组,保存输入的buf,比如整个HelloWorld.java文件的字符数组.最后一个数组元素的值为EOI,表示已经没有可读取的字符.
然后每次读取token时,从buf数组读取到sbuf里,重复利用sbuf.
标识符解析
com.sun.tools.javac.parser.JavaTokenizer#scanIdent
/**
* Scans an identifier or keyword that starts at the current character.
*
* <p>Characters are accumulated until one is found that cannot be part of an
* identifier (or the end of input is reached). The accumulated name is then
* stored in {@code name} and its token kind, looked up through
* {@code tokens.lookupKind(name)}, in {@code tk}.
*/
private void scanIdent() {
boolean isJavaIdentifierPart; // whether the current char belongs to the identifier; false means the identifier (e.g. "package") is complete
char high;
reader.putChar(true); // per the original note: store the char in sbuf and keep reading
do {
switch (reader.ch) { // reader.buf holds the characters of the Java source file as a char[]
case 'A': case 'B': case 'C': case 'D': case 'E':
case 'F': case 'G': case 'H': case 'I': case 'J':
case 'K': case 'L': case 'M': case 'N': case 'O':
case 'P': case 'Q': case 'R': case 'S': case 'T':
case 'U': case 'V': case 'W': case 'X': case 'Y':
case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e':
case 'f': case 'g': case 'h': case 'i': case 'j':
case 'k': case 'l': case 'm': case 'n': case 'o':
case 'p': case 'q': case 'r': case 's': case 't':
case 'u': case 'v': case 'w': case 'x': case 'y':
case 'z':
case '$': case '_':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
// ordinary identifier char: fall out of the switch to the putChar below
break;
case '\u0000': case '\u0001': case '\u0002': case '\u0003':
case '\u0004': case '\u0005': case '\u0006': case '\u0007':
case '\u0008': case '\u000E': case '\u000F': case '\u0010':
case '\u0011': case '\u0012': case '\u0013': case '\u0014':
case '\u0015': case '\u0016': case '\u0017':
case '\u0018': case '\u0019': case '\u001B':
case '\u007F':
// control chars: advanced past without putChar, then scanning continues
reader.scanChar();
continue;
case '\u001A': // EOI is also a legal identifier part
if (reader.bp >= reader.buflen) { // no more characters left to process
name = reader.name();
tk = tokens.lookupKind(name);
return;
}
reader.scanChar();
continue;
default:
// Any ASCII char (below U+0080) that reaches the default branch cannot be part of
// an identifier, because every ASCII char that can is handled by the cases above.
// isJavaIdentifierPart becomes false and the chars collected in sbuf form the token.
if (reader.ch < '\u0080') {
// all ASCII range chars already handled, above
isJavaIdentifierPart = false; // e.g. space, '.', ';' all end up here
} else {
if (Character.isIdentifierIgnorable(reader.ch)) {
reader.scanChar();
continue;
} else {
// may be a surrogate pair: test the full code point when there is one
high = reader.scanSurrogates();
if (high != 0) {
reader.putChar(high);
isJavaIdentifierPart = Character.isJavaIdentifierPart(
Character.toCodePoint(high, reader.ch));
} else {
isJavaIdentifierPart = Character.isJavaIdentifierPart(reader.ch);
}
}
}
if (!isJavaIdentifierPart) { // the identifier is complete
name = reader.name(); // name built from the chars saved in sbuf, i.e. one token such as "package"
tk = tokens.lookupKind(name); // look up the token kind for that name
return;
}
}
reader.putChar(true); // store the current char in sbuf and go on reading from buf
} while (true); // loop until one of the returns above
}
解析自定义的变量、Java 标识符等。当遇到空格( )、点(.)、分号(;)等不属于标识符的字符时,就能拆分出一个 token:通过 reader.name() 获取 token 名称,再通过 tokens.lookupKind(name) 查出对应的 TokenKind 并保存到 tk 中。