今天我们接着上一篇,说明词法分析的实现
首先要说明的是,我们使用C#语言来作为开发语言。虽然是C#语言,我会尽量避免使用C#特有的特性,以便于移植到其它语言
要实现一个编译器(解释器)这样的任务很复杂,但如同做任何其它复杂任务一样,我们可以把一个复杂任务分成几个简单任务,然后再把每一个简单任务分成一个个更小的任务。上一篇我们讲解了实现步骤分为 词法分析->语法分析->语义分析->执行,现在我们就来实现 词法分析 的功能
假设我们的程序如下
var age;
fun main(){
age=0;
}
词法分析的任务就是要能分解成如下的内容
var | age | ; | fun | main | ( | ) | { | age | = | 0 | ; | } |
这里的每一个部分称为一个单词(不是英文中的单词的意义)。为了与英文单词相区分,这里的单词有一个专有的名称———Token
一个Token就是由程序字符串分离出的 用来组成 语法树的基本单元。
实际实现的时候,Token都会设置一些属性字段,主要的有Token类型、Token的值、Token在文件中的行列号等
下面是我们的Token类型定义
/// <summary>
/// Kinds of token handled by the lexer. Every value is written out explicitly so
/// that inserting or reordering members cannot silently renumber the others.
/// </summary>
public enum TokenType
{
    // ---- bookkeeping ----
    /// <summary>Placeholder type; also the start state when recognising symbol tokens.</summary>
    DEFAULT = 0,
    /// <summary>Line comment or block comment.</summary>
    COMMENT = 1,
    /// <summary>End of input; returned when no other token is left.</summary>
    EOF = 2,

    // ---- literals and names ----
    /// <summary>Integer literal.</summary>
    INT = 3,
    /// <summary>Real (floating-point) literal.</summary>
    REAL = 4,
    /// <summary>String literal.</summary>
    STR = 5,
    /// <summary>Character literal.</summary>
    CHAR = 6,
    /// <summary>Identifier; keywords such as var/fun are also reported as ID.</summary>
    ID = 7,

    // ---- logical / assignment / unary ----
    AND = 8,                 // &&
    OR = 9,                  // ||
    ASSIGN = 10,             // =
    NOT = 11,                // !
    BIT_NOT = 12,            // ~

    // ---- dots ----
    DOT = 13,                // .
    DOTDOT = 14,             // .. (intermediate state on the way to ELLIPSIS)
    ELLIPSIS = 15,           // ...

    // ---- arithmetic and bitwise ----
    ADD = 16,                // +
    SUB = 17,                // -
    STAR = 18,               // *
    DIV = 19,                // /
    MOD = 20,                // %
    BITAND = 21,             // &
    BITOR = 22,              // |
    BITXOR = 23,             // ^
    INCREASE = 24,           // ++
    DECREASE = 25,           // --

    // ---- compound assignment ----
    ADD_ASSIGN = 26,         // +=
    SUB_ASSIGN = 27,         // -=
    MUL_ASSIGN = 28,         // *=
    DIV_ASSIGN = 29,         // /=
    MOD_ASSIGN = 30,         // %=
    BITAND_ASSIGN = 31,      // &=
    BITOR_ASSIGN = 32,       // |=
    BITXOR_ASSIGN = 33,      // ^=

    // ---- shifts ----
    LEFT_MOVE = 34,          // <<
    RIGHT_MOVE = 35,         // >>
    LEFT_MOVE_ASSIGN = 36,   // <<=
    RIGHT_MOVE_ASSIGN = 37,  // >>=

    // ---- comparison ----
    EQUAL = 38,              // ==
    NOTEQUAL = 39,           // !=
    BIGGER = 40,             // >
    LESS = 41,               // <
    BIGGER_EQUAL = 42,       // >=
    LESS_EQUAL = 43,         // <=

    // ---- brackets and punctuation ----
    LEFT_PARA = 44,          // (
    RIGHT_PARA = 45,         // )
    LEFT_SQUARE = 46,        // [
    RIGHT_SQUARE = 47,       // ]
    BEGIN = 48,              // {
    END = 49,                // }
    COMMA = 50,              // ,
    COLON = 51,              // :
    SEMICOLON = 52,          // ;
    QUESTIOIN_SIGN = 53,     // ? (identifier spelling kept as-is for compatibility)
    POINTER_TO = 54,         // presumably "->" -- TODO confirm against the parser
}
DEFAULT 类型用于辅助词法分析的实现,后面会讲解,COMMENT表示注释类型(包括行注释和块注释),EOF表示 end of file 当没有其它Token时会给出一个类型为EOF的Token,INT 表示整数 REAL表示实数(我们作为double 来处理)STR 表示字符串,CHAR 表示字符,ID表示标识符 ,其它表示各种符号类型,仍然以上面的程序为例子,第二行表示各个Token的类型:
var | age | ; | fun | main | ( | ) | { | age | = | 0 | ; | } |
ID | ID | SEMICOLON | ID | ID | LEFT_PARA | RIGHT_PARA | BEGIN | ID | ASSIGN | INT | SEMICOLON | END |
需要注意的是 var 和 fun 是关键字,我们也作为ID来处理。需要判断是否为关键字时,不仅要判断Token的类型是否为ID,还要判断其字符串是否为 var 或 fun
下面是Token类的定义:
/// <summary>
/// One lexical unit: its type, the source text it came from, its decoded value
/// and the start/end position recorded by the lexer.
/// </summary>
public class Token
{
    // ---- position ----
    /// <summary>Line number recorded when the token was created.</summary>
    public int LineNumber;
    /// <summary>Column recorded when the token was created.</summary>
    public int Col;
    /// <summary>End line; starts out equal to <see cref="LineNumber"/>.</summary>
    public int EndLineNumber;
    /// <summary>End column; starts out equal to <see cref="Col"/>.</summary>
    public int EndCol;

    // ---- origin and classification ----
    /// <summary>Lexer instance passed to the constructor.</summary>
    public Lexer lex;
    /// <summary>Kind of this token.</summary>
    public TokenType tokenType;
    /// <summary>Original source text of the token; empty until filled in.</summary>
    public string OrigStr;

    // ---- decoded payload (which field is meaningful depends on tokenType) ----
    /// <summary>Integer payload; 0 by default.</summary>
    public int int_val;
    /// <summary>Real-number payload; 0.0 by default.</summary>
    public double real_val;
    /// <summary>String payload; empty by default.</summary>
    public string str_value;
    /// <summary>Identifier text; empty by default.</summary>
    public string identifier;

    /// <summary>
    /// Creates a token of the given type at the given position. The end position
    /// is initialised to the start position and all payload fields to their
    /// empty/zero defaults.
    /// </summary>
    /// <param name="lex">Lexer to associate with the token.</param>
    /// <param name="tokenType">Kind of the token.</param>
    /// <param name="LineNumber">Start line.</param>
    /// <param name="Col">Start column.</param>
    public Token(Lexer lex, TokenType tokenType, int LineNumber, int Col)
    {
        this.lex = lex;
        this.tokenType = tokenType;

        this.LineNumber = LineNumber;
        this.EndLineNumber = LineNumber;
        this.Col = Col;
        this.EndCol = Col;

        this.identifier = "";
        this.OrigStr = this.identifier;
        this.str_value = "";
        this.int_val = 0;
        this.real_val = 0.0;
    }
}
不再详细解释
关于词法分析的理论这里不做介绍,大家可以自己去查阅,分析方法有 查表法,手工编写代码,或用工具生成(lex,flex,jacc等)
我们不使用任何第三方工具,但如果单纯编码来做,代码又比较长。这里使用一个小技巧:首先识别是否为数字、字符串、字符、标识符;如果都不是,就当做符号来识别;若连符号也不是,就表示程序有错误。在符号识别部分,我们构建一个数组表来辅助分析,这样可将代码缩短一些。
下面是词法分析的代码:
/// <summary>
/// Hand-written lexer: turns program text into <see cref="Token"/>s, one per call
/// to <c>nextToken()</c>, ending with a token of type <c>TokenType.EOF</c>.
/// </summary>
/// <remarks>
/// <c>LineNumber</c> and <c>Col</c> advance whenever a character is read from the
/// input, including characters that have only been read as lookahead, so they can
/// run ahead of the last consumed character by the current lookahead distance.
/// </remarks>
public class Lexer
{
    // Entire program text; read sequentially by readChar().
    char[] programContent;
    // Index of the next unread character in programContent.
    long idx = 0;
    // Sentinel character returned once the input is exhausted.
    char EOF = '\0';

    /// <summary>Line of the most recently read character (starts at 1).</summary>
    public int LineNumber = 1;
    /// <summary>Column of the most recently read character; reset at each '\n'.</summary>
    public int Col = 1;
    /// <summary>Name used in error messages.</summary>
    public string fileName = "lexer";

    /// <summary>Creates a lexer over the given program text.</summary>
    /// <param name="program">Source text to tokenize; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="program"/> is null.</exception>
    public Lexer(string program)
    {
        if (program == null) throw new ArgumentNullException("program");
        this.programContent = program.ToCharArray();
        LineNumber = 1;
        Col = 0;
        idx = 0;
    }

    /// <summary>No-op; the lexer holds no resources that need releasing.</summary>
    public void Close()
    {
    }

    // Reads the next raw character from the program text and updates
    // LineNumber/Col. Returns EOF when the text is exhausted.
    char readChar()
    {
        if (idx >= programContent.Length) return EOF;
        char ch = programContent[idx++];
        if (ch == '\n')
        {
            LineNumber++;
            Col = -1; // becomes 0 after the increment below
        }
        Col++;
        return ch;
    }

    // Characters already read from the input for lookahead but not yet consumed.
    List<char> bufChar = new List<char>();

    /// <summary>
    /// Consumes and returns the next character, taking it from the lookahead
    /// buffer first. Returns the EOF sentinel ('\0') at end of input.
    /// </summary>
    public virtual char nextChar()
    {
        if (bufChar.Count > 0)
        {
            var buffered = bufChar[0];
            bufChar.RemoveAt(0);
            return buffered;
        }
        return readChar();
    }

    /// <summary>
    /// Returns the <paramref name="num"/>-th upcoming character (1 = the very next
    /// one) without consuming it, or the EOF sentinel if the input ends first.
    /// </summary>
    /// <param name="num">Lookahead distance; must be at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="num"/> is less than 1.</exception>
    public char LookaheadChar(int num)
    {
        if (num < 1) throw new ArgumentOutOfRangeException("num");
        while (bufChar.Count < num)
        {
            var ch = readChar();
            if (ch == EOF) break;
            bufChar.Add(ch);
        }
        // The input may end before num characters are available, so this check is required.
        if (bufChar.Count >= num)
            return bufChar[num - 1];
        return EOF;
    }

    // True for whitespace and for '/', which may start a comment.
    bool isCommentOrBlankChar(char ch)
    {
        return ch == '\r' || ch == '\n' || ch == '\t' || ch == ' ' || ch == '\f' || ch == '/';
    }

    /// <summary>Every comment encountered so far, in source order.</summary>
    public List<Token> commentTokenList = new List<Token>();

    // Skips whitespace and comments in front of the next token. Each comment is
    // recorded in commentTokenList. A '/' that does not start a comment is left
    // in place for the symbol recogniser.
    void strip()
    {
        while (isCommentOrBlankChar(LookaheadChar(1)))
        {
            if (LookaheadChar(1) != '/')
            {
                nextChar(); // plain whitespace: just consume it
                continue;
            }
            var chNext = LookaheadChar(2);
            if (chNext == '/') // line comment
            {
                // Two characters have been read as lookahead, hence Col - 1.
                var tokenComment = new Token(this, TokenType.COMMENT, this.LineNumber, this.Col - 1);
                this.commentTokenList.Add(tokenComment);
                while (LookaheadChar(1) != '\n')
                {
                    tokenComment.EndCol = this.Col;
                    if (nextChar() == EOF) { tokenComment.EndCol++; return; }
                }
                skip('\n');
                tokenComment.EndCol++;
            }
            else if (chNext == '*') // block comment
            {
                var tokenComment = new Token(this, TokenType.COMMENT, this.LineNumber, this.Col - 1);
                this.commentTokenList.Add(tokenComment);
                skip('/'); skip('*');
                while (true)
                {
                    while (LookaheadChar(1) != '*')
                    {
                        if (nextChar() == EOF)
                        {
                            // Unterminated block comment: record where the input ended.
                            tokenComment.EndLineNumber = this.LineNumber;
                            tokenComment.EndCol = this.Col;
                            return;
                        }
                    }
                    if (LookaheadChar(2) == '/') // end of block comment
                    {
                        skip('*'); skip('/');
                        break;
                    }
                    skip('*'); // a lone '*' inside the comment
                }
                tokenComment.EndLineNumber = this.LineNumber;
                tokenComment.EndCol = this.Col;
            }
            else
                break; // a '/' that does not start a comment
        }
    }

    // Consumes one character and reports an error through AST.exitMsg when it
    // is not the expected one.
    void skip(char ch)
    {
        var temp = nextChar();
        if (ch != temp)
            AST.exitMsg(string.Format("expect:'{3}' in file {0} {1}:{2}", this.fileName, this.LineNumber, this.Col, ch));
    }

    /// <summary>Every string-literal token produced so far, in source order.</summary>
    public List<Token> stringLiteralList = new List<Token>();

    /// <summary>
    /// Returns the next token. Whitespace and comments are skipped first; at end
    /// of input a token of type <c>TokenType.EOF</c> is returned.
    /// </summary>
    public Token nextToken()
    {
        strip();
        char ch = LookaheadChar(1);
        Token t;
        if (ch == EOF)
            t = new Token(this, TokenType.EOF, this.LineNumber, this.Col);
        else if (ch == '\"')
        {
            t = new Token(this, TokenType.STR, this.LineNumber, this.Col);
            t.OrigStr = nextChar().ToString();
            var terminated = true;
            while (this.LookaheadChar(1) != '\"')
            {
                if (this.LookaheadChar(1) == EOF)
                {
                    // Without this check an unterminated literal would loop forever,
                    // because reading at end of input keeps returning EOF.
                    terminated = false;
                    AST.exitMsg(string.Format("unterminated string literal in file {0} {1}:{2}", this.fileName, t.LineNumber, t.Col));
                    break;
                }
                t.str_value += GetStrChar(t, '\"');
            }
            if (terminated)
            {
                skip('\"');
                t.OrigStr += "\"";
            }
            this.stringLiteralList.Add(t);
        }
        else if (ch == '\'')
        {
            t = new Token(this, TokenType.CHAR, this.LineNumber, this.Col);
            t.OrigStr = nextChar().ToString();
            ch = GetStrChar(t, '\'');
            t.int_val = (int)ch;
            skip('\'');
            t.OrigStr += "'";
        }
        else if (isDigital(ch))
            t = LexNumToken(nextChar());
        else if (isLetter(ch) || ch == '_')
        {
            t = new Token(this, TokenType.ID, this.LineNumber, this.Col);
            while (isIdentifierChar(LookaheadChar(1)))
                t.identifier += nextChar();
            t.OrigStr = t.identifier;
        }
        else
            t = getTokenBySignChar(ch);
        t.EndLineNumber = this.LineNumber;
        t.EndCol = this.Col;
        return t;
    }

    // State-transition table for symbol tokens. Each row is
    //   { current state, input character, next state, flag }
    // where flag is 1 = token complete, 2 = valid token that may still be
    // extended, 3 = must be extended (stopping here is an error).
    List<List<int>> OperateArr = new List<List<int>>{
        new List<int>{ (int)TokenType.DEFAULT, '(', (int)TokenType.LEFT_PARA, 1},
        new List<int>{ (int)TokenType.DEFAULT, ')', (int)TokenType.RIGHT_PARA, 1},
        new List<int>{ (int)TokenType.DEFAULT, '[', (int)TokenType.LEFT_SQUARE, 1},
        new List<int>{ (int)TokenType.DEFAULT, ']', (int)TokenType.RIGHT_SQUARE, 1},
        new List<int>{ (int)TokenType.DEFAULT, '{', (int)TokenType.BEGIN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '}', (int)TokenType.END, 1},
        new List<int>{ (int)TokenType.DEFAULT, ';', (int)TokenType.SEMICOLON, 1},
        new List<int>{ (int)TokenType.DEFAULT, ',', (int)TokenType.COMMA, 1},
        new List<int>{ (int)TokenType.DEFAULT, '?', (int)TokenType.QUESTIOIN_SIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, ':', (int)TokenType.COLON, 1},
        new List<int>{ (int)TokenType.DEFAULT, '^', (int)TokenType.BITXOR, 2},
        new List<int>{ (int)TokenType.BITXOR, '=', (int)TokenType.BITXOR_ASSIGN, 1},
        // '~' has no compound-assignment form, so it is complete on its own.
        new List<int>{ (int)TokenType.DEFAULT, '~', (int)TokenType.BIT_NOT, 1},
        new List<int>{ (int)TokenType.DEFAULT, '+', (int)TokenType.ADD, 2},
        new List<int>{ (int)TokenType.ADD, '+', (int)TokenType.INCREASE, 1},
        new List<int>{ (int)TokenType.ADD, '=', (int)TokenType.ADD_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '-', (int)TokenType.SUB, 2},
        new List<int>{ (int)TokenType.SUB, '-', (int)TokenType.DECREASE, 1},
        new List<int>{ (int)TokenType.SUB, '=', (int)TokenType.SUB_ASSIGN, 1},
        new List<int>{ (int)TokenType.SUB, '>', (int)TokenType.POINTER_TO, 1},
        new List<int>{ (int)TokenType.DEFAULT, '*', (int)TokenType.STAR, 2},
        new List<int>{ (int)TokenType.STAR, '=', (int)TokenType.MUL_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '/', (int)TokenType.DIV, 2},
        new List<int>{ (int)TokenType.DIV, '=', (int)TokenType.DIV_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '%', (int)TokenType.MOD, 2},
        new List<int>{ (int)TokenType.MOD, '=', (int)TokenType.MOD_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '=', (int)TokenType.ASSIGN, 2},
        new List<int>{ (int)TokenType.ASSIGN, '=', (int)TokenType.EQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '!', (int)TokenType.NOT, 2},
        new List<int>{ (int)TokenType.NOT, '=', (int)TokenType.NOTEQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '&', (int)TokenType.BITAND, 2},
        new List<int>{ (int)TokenType.BITAND, '&', (int)TokenType.AND, 1},
        new List<int>{ (int)TokenType.BITAND, '=', (int)TokenType.BITAND_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '|', (int)TokenType.BITOR, 2},
        new List<int>{ (int)TokenType.BITOR, '|', (int)TokenType.OR, 1},
        new List<int>{ (int)TokenType.BITOR, '=', (int)TokenType.BITOR_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '>', (int)TokenType.BIGGER, 2},
        new List<int>{ (int)TokenType.BIGGER, '>', (int)TokenType.RIGHT_MOVE, 2},
        new List<int>{ (int)TokenType.RIGHT_MOVE, '=', (int)TokenType.RIGHT_MOVE_ASSIGN, 1},
        new List<int>{ (int)TokenType.BIGGER, '=', (int)TokenType.BIGGER_EQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '<', (int)TokenType.LESS, 2},
        new List<int>{ (int)TokenType.LESS, '<', (int)TokenType.LEFT_MOVE, 2},
        new List<int>{ (int)TokenType.LEFT_MOVE, '=', (int)TokenType.LEFT_MOVE_ASSIGN, 1},
        new List<int>{ (int)TokenType.LESS, '=', (int)TokenType.LESS_EQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '.', (int)TokenType.DOT, 2},
        new List<int>{ (int)TokenType.DOT, '.', (int)TokenType.DOTDOT, 3},
        new List<int>{ (int)TokenType.DOTDOT, '.', (int)TokenType.ELLIPSIS, 1}
    };

    // Recognises a symbol token starting at the lookahead character ch by
    // repeatedly following OperateArr transitions (longest match). Reports an
    // error through AST.exitMsg when ch starts no symbol, or when the symbol
    // stops in a state that requires further input (flag 3).
    Token getTokenBySignChar(char ch)
    {
        var lastState = 0;
        var token = new Token(this, TokenType.DEFAULT, this.LineNumber, this.Col);
        var advanced = true;
        // Look the (state, character) pair up in the whole table on every step.
        // The original scanned only a fixed window of following rows and so
        // never reached the ">=" and "<=" rows.
        while (advanced && lastState != 1)
        {
            advanced = false;
            foreach (var item in OperateArr)
            {
                if (item[0] != (int)token.tokenType || (char)item[1] != ch) continue;
                token.OrigStr += ch;
                skip(ch);
                ch = LookaheadChar(1);
                token.tokenType = (TokenType)item[2];
                lastState = item[3];
                advanced = true;
                break;
            }
        }
        if (lastState == 0)
        {
            // Consume the offending character so that a caller which keeps
            // going after the error does not see it again forever.
            token.OrigStr += ch;
            nextChar();
        }
        if (lastState == 3 || lastState == 0) AST.exitMsg(string.Format("Invalid char:{0} {1}", this.LineNumber, this.Col));
        token.EndLineNumber = this.LineNumber;
        token.EndCol = this.Col;
        return token;
    }

    // Reads one (possibly escaped) character of a string or character literal,
    // appends its source text to t.OrigStr and returns the decoded character.
    // shiftCh is the literal's own quote character, which may be escaped.
    // Throws Exception for an unknown escape sequence.
    private char GetStrChar(Token t, char shiftCh)
    {
        char ch = nextChar();
        t.OrigStr += ch.ToString();
        if (ch != '\\') return ch;
        ch = nextChar();
        if (ch == '0')
        {
            // Numeric escape: \0, \0NNN (octal) or \0xNN (hex).
            var strAr = numStr(ch).Split('_');
            var baseNumStr = strAr[0];
            var numText = strAr[1];
            // Append rather than overwrite, so the text read so far is kept.
            t.OrigStr += numText;
            var realNumStr = baseNumStr == "16" ? numText.Substring(2) : numText;
            return (char)Convert.ToInt32(realNumStr, int.Parse(baseNumStr));
        }
        t.OrigStr += ch.ToString();
        switch (ch)
        {
            case 'a':
                return '\a';
            case 'b':
                return '\b';
            case 't':
                return '\t';
            case 'r':
                return '\r';
            case 'n':
                return '\n';
            case 'v':
                return '\v';
            case 'f':
                return '\f';
            case '\\':
                return '\\';
            default:
                if (ch == shiftCh) return shiftCh;
                else if (ch >= '!' && ch <= '~')
                    throw new Exception(string.Format("非法转义字符: \'\\{0}\'", (char)ch));
                else
                    throw new Exception(string.Format("非法转义字符: \'\\0x{0:X}\'", (int)ch));
        }
    }

    // Reads the rest of a number whose first digit ch has already been
    // consumed. Returns "<base>_<text>", e.g. "10_123", "08_017", "16_0x1F".
    string numStr(char ch)
    {
        var baseNum = 10;
        var str = ch.ToString();
        char chNext = LookaheadChar(1);
        // A leading 0 means one of: a lone 0, 0x... (hexadecimal), 0... (octal).
        if (ch == '0')
        {
            if (chNext == 'x' || chNext == 'X') // hexadecimal
            {
                skip(chNext);
                str += chNext.ToString();
                baseNum = 16;
            }
            else if (isDigital(chNext)) // another digit follows: octal
            {
                baseNum = 8;
            }
            else
            {
                // a lone 0
                return "10_" + str;
            }
        }
        while (isBaseNumLetter(LookaheadChar(1), baseNum))
            str += nextChar();
        return baseNum.ToString("d2") + "_" + str;
    }

    // Builds an INT token from a number whose first digit ch has already been consumed.
    private Token LexNumToken(char ch)
    {
        var t = new Token(this, TokenType.INT, this.LineNumber, this.Col);
        var strAr = numStr(ch).Split('_');
        var baseNumStr = strAr[0];
        t.OrigStr = strAr[1];
        // The whole text, including any "0x" prefix, is handed to Convert.ToInt32.
        t.int_val = Convert.ToInt32(t.OrigStr, int.Parse(baseNumStr));
        return t;
    }

    // True for characters allowed inside an identifier.
    bool isIdentifierChar(char ch)
    {
        return isLetter(ch) || ch == '_' || isDigital(ch);
    }

    // Numeric value of a digit or hex letter; throws Exception for anything else.
    int getNumFromChar(char ch)
    {
        if (isDigital(ch))
            return ch - '0';
        if (ch >= 'a' && ch <= 'f')
            return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
            return ch - 'A' + 10;
        throw new Exception("8进制或16进制数据错误!");
    }

    // True when ch is a valid digit in the given base (8, 10 or 16).
    private bool isBaseNumLetter(char ch, int baseNum)
    {
        var isHexLetter = (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
        if (!isDigital(ch) && !isHexLetter)
            return false;
        return getNumFromChar(ch) < baseNum;
    }

    // True for ASCII letters.
    private bool isLetter(char ch)
    {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    }

    // True for ASCII decimal digits.
    private bool isDigital(char ch)
    {
        return ch >= '0' && ch <= '9';
    }

    /// <summary>
    /// Tokenizes <paramref name="program"/> and writes one line per token
    /// (type, then original text) to the console, ending with the EOF token.
    /// </summary>
    public static void testPrint(string program)
    {
        var l = new Lexer(program);
        while (true)
        {
            var t = l.nextToken();
            Console.WriteLine(t.tokenType.ToString().PadRight(18) + ":" + t.OrigStr);
            if (t.tokenType == TokenType.EOF) break;
        }
    }
}
每次调用nextToken()方法返回一个Token,直到返回类型为EOF 的Token,说明词法分析已完成。值得注意的是,在nextToken()方法的开始要先调用strip()方法,strip()方法负责把无效的字符去除掉,如 '\r'、'\n'、'\t'、'空格' 等,注释也放在这个方法里处理
在去除了无效的字符之后,如果第一个字符是双引号,表示这是一个字符串;如果第一个字符是单引号,表示这是一个字符;如果第一个字符是数字,表示这是一个数字(INT 或 REAL,本篇的代码只实现了 INT);如果第一个字符是字母或下划线,表示这是一个标识符;如果以上都不是,就一定是一个符号Token,我们就调用getTokenBySignChar 方法来处理
getTokenBySignChar方法应该算相对比较难理解的部分了,
我们先来说明 OperateArr数组(这里用List来写,其实跟数组一样的,在C语言中可以写二维数组)
new List<int>{ (int)TokenType.DEFAULT,'(', (int)TokenType.LEFT_PARA, 1}
第一项表示当前所处的状态(默认是DEFAULT),
第二项表示即将扫描的字符,
第三项表示如果当前状态是第一项指定的状态,且扫描的字符是第二项指定的字符,那么将状态转换到这一项指定的状态,
第四项表示是否要继续扫描:1 表示扫描完成;2 表示当前已经是一个合法的Token,但还可以尝试继续扫描出更长的符号;3 表示必须继续扫描(如果停在这个状态就报错)。为了提高速度,遇到 2 之后只需继续扫描表中紧随其后的少数几项,而不必扫描整张表(请自行分析能这样做的原因)
最后来说一下测试方法
SimpleC.Lexer.testPrint(programStr);
假如programStr为如下程序:
var age=30;
fun main(){
age=26;
}
输出如下就表示成功:
ID :var
ID :age
ASSIGN :=
INT :30
SEMICOLON :;
ID :fun
ID :main
LEFT_PARA :(
RIGHT_PARA :)
BEGIN :{
ID :age
ASSIGN :=
INT :26
SEMICOLON :;
END :}
EOF :
下面再附上Lexer 类的完整代码
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace SimpleC
{
/// <summary>
/// Hand-written lexer: turns program text into <see cref="Token"/>s, one per call
/// to <c>nextToken()</c>, ending with a token of type <c>TokenType.EOF</c>.
/// </summary>
/// <remarks>
/// <c>LineNumber</c> and <c>Col</c> advance whenever a character is read from the
/// input, including characters that have only been read as lookahead, so they can
/// run ahead of the last consumed character by the current lookahead distance.
/// </remarks>
public class Lexer
{
    // Entire program text; read sequentially by readChar().
    char[] programContent;
    // Index of the next unread character in programContent.
    long idx = 0;
    // Sentinel character returned once the input is exhausted.
    char EOF = '\0';

    /// <summary>Line of the most recently read character (starts at 1).</summary>
    public int LineNumber = 1;
    /// <summary>Column of the most recently read character; reset at each '\n'.</summary>
    public int Col = 1;
    /// <summary>Name used in error messages.</summary>
    public string fileName = "lexer";

    /// <summary>Creates a lexer over the given program text.</summary>
    /// <param name="program">Source text to tokenize; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="program"/> is null.</exception>
    public Lexer(string program)
    {
        if (program == null) throw new ArgumentNullException("program");
        this.programContent = program.ToCharArray();
        LineNumber = 1;
        Col = 0;
        idx = 0;
    }

    /// <summary>No-op; the lexer holds no resources that need releasing.</summary>
    public void Close()
    {
    }

    // Reads the next raw character from the program text and updates
    // LineNumber/Col. Returns EOF when the text is exhausted.
    char readChar()
    {
        if (idx >= programContent.Length) return EOF;
        char ch = programContent[idx++];
        if (ch == '\n')
        {
            LineNumber++;
            Col = -1; // becomes 0 after the increment below
        }
        Col++;
        return ch;
    }

    // Characters already read from the input for lookahead but not yet consumed.
    List<char> bufChar = new List<char>();

    /// <summary>
    /// Consumes and returns the next character, taking it from the lookahead
    /// buffer first. Returns the EOF sentinel ('\0') at end of input.
    /// </summary>
    public virtual char nextChar()
    {
        if (bufChar.Count > 0)
        {
            var buffered = bufChar[0];
            bufChar.RemoveAt(0);
            return buffered;
        }
        return readChar();
    }

    /// <summary>
    /// Returns the <paramref name="num"/>-th upcoming character (1 = the very next
    /// one) without consuming it, or the EOF sentinel if the input ends first.
    /// </summary>
    /// <param name="num">Lookahead distance; must be at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="num"/> is less than 1.</exception>
    public char LookaheadChar(int num)
    {
        if (num < 1) throw new ArgumentOutOfRangeException("num");
        while (bufChar.Count < num)
        {
            var ch = readChar();
            if (ch == EOF) break;
            bufChar.Add(ch);
        }
        // The input may end before num characters are available, so this check is required.
        if (bufChar.Count >= num)
            return bufChar[num - 1];
        return EOF;
    }

    // True for whitespace and for '/', which may start a comment.
    bool isCommentOrBlankChar(char ch)
    {
        return ch == '\r' || ch == '\n' || ch == '\t' || ch == ' ' || ch == '\f' || ch == '/';
    }

    /// <summary>Every comment encountered so far, in source order.</summary>
    public List<Token> commentTokenList = new List<Token>();

    // Skips whitespace and comments in front of the next token. Each comment is
    // recorded in commentTokenList. A '/' that does not start a comment is left
    // in place for the symbol recogniser.
    void strip()
    {
        while (isCommentOrBlankChar(LookaheadChar(1)))
        {
            if (LookaheadChar(1) != '/')
            {
                nextChar(); // plain whitespace: just consume it
                continue;
            }
            var chNext = LookaheadChar(2);
            if (chNext == '/') // line comment
            {
                // Two characters have been read as lookahead, hence Col - 1.
                var tokenComment = new Token(this, TokenType.COMMENT, this.LineNumber, this.Col - 1);
                this.commentTokenList.Add(tokenComment);
                while (LookaheadChar(1) != '\n')
                {
                    tokenComment.EndCol = this.Col;
                    if (nextChar() == EOF) { tokenComment.EndCol++; return; }
                }
                skip('\n');
                tokenComment.EndCol++;
            }
            else if (chNext == '*') // block comment
            {
                var tokenComment = new Token(this, TokenType.COMMENT, this.LineNumber, this.Col - 1);
                this.commentTokenList.Add(tokenComment);
                skip('/'); skip('*');
                while (true)
                {
                    while (LookaheadChar(1) != '*')
                    {
                        if (nextChar() == EOF)
                        {
                            // Unterminated block comment: record where the input ended.
                            tokenComment.EndLineNumber = this.LineNumber;
                            tokenComment.EndCol = this.Col;
                            return;
                        }
                    }
                    if (LookaheadChar(2) == '/') // end of block comment
                    {
                        skip('*'); skip('/');
                        break;
                    }
                    skip('*'); // a lone '*' inside the comment
                }
                tokenComment.EndLineNumber = this.LineNumber;
                tokenComment.EndCol = this.Col;
            }
            else
                break; // a '/' that does not start a comment
        }
    }

    // Consumes one character and reports an error through AST.exitMsg when it
    // is not the expected one.
    void skip(char ch)
    {
        var temp = nextChar();
        if (ch != temp)
            AST.exitMsg(string.Format("expect:'{3}' in file {0} {1}:{2}", this.fileName, this.LineNumber, this.Col, ch));
    }

    /// <summary>Every string-literal token produced so far, in source order.</summary>
    public List<Token> stringLiteralList = new List<Token>();

    /// <summary>
    /// Returns the next token. Whitespace and comments are skipped first; at end
    /// of input a token of type <c>TokenType.EOF</c> is returned.
    /// </summary>
    public Token nextToken()
    {
        strip();
        char ch = LookaheadChar(1);
        Token t;
        if (ch == EOF)
            t = new Token(this, TokenType.EOF, this.LineNumber, this.Col);
        else if (ch == '\"')
        {
            t = new Token(this, TokenType.STR, this.LineNumber, this.Col);
            t.OrigStr = nextChar().ToString();
            var terminated = true;
            while (this.LookaheadChar(1) != '\"')
            {
                if (this.LookaheadChar(1) == EOF)
                {
                    // Without this check an unterminated literal would loop forever,
                    // because reading at end of input keeps returning EOF.
                    terminated = false;
                    AST.exitMsg(string.Format("unterminated string literal in file {0} {1}:{2}", this.fileName, t.LineNumber, t.Col));
                    break;
                }
                t.str_value += GetStrChar(t, '\"');
            }
            if (terminated)
            {
                skip('\"');
                t.OrigStr += "\"";
            }
            this.stringLiteralList.Add(t);
        }
        else if (ch == '\'')
        {
            t = new Token(this, TokenType.CHAR, this.LineNumber, this.Col);
            t.OrigStr = nextChar().ToString();
            ch = GetStrChar(t, '\'');
            t.int_val = (int)ch;
            skip('\'');
            t.OrigStr += "'";
        }
        else if (isDigital(ch))
            t = LexNumToken(nextChar());
        else if (isLetter(ch) || ch == '_')
        {
            t = new Token(this, TokenType.ID, this.LineNumber, this.Col);
            while (isIdentifierChar(LookaheadChar(1)))
                t.identifier += nextChar();
            t.OrigStr = t.identifier;
        }
        else
            t = getTokenBySignChar(ch);
        t.EndLineNumber = this.LineNumber;
        t.EndCol = this.Col;
        return t;
    }

    // State-transition table for symbol tokens. Each row is
    //   { current state, input character, next state, flag }
    // where flag is 1 = token complete, 2 = valid token that may still be
    // extended, 3 = must be extended (stopping here is an error).
    List<List<int>> OperateArr = new List<List<int>>{
        new List<int>{ (int)TokenType.DEFAULT, '(', (int)TokenType.LEFT_PARA, 1},
        new List<int>{ (int)TokenType.DEFAULT, ')', (int)TokenType.RIGHT_PARA, 1},
        new List<int>{ (int)TokenType.DEFAULT, '[', (int)TokenType.LEFT_SQUARE, 1},
        new List<int>{ (int)TokenType.DEFAULT, ']', (int)TokenType.RIGHT_SQUARE, 1},
        new List<int>{ (int)TokenType.DEFAULT, '{', (int)TokenType.BEGIN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '}', (int)TokenType.END, 1},
        new List<int>{ (int)TokenType.DEFAULT, ';', (int)TokenType.SEMICOLON, 1},
        new List<int>{ (int)TokenType.DEFAULT, ',', (int)TokenType.COMMA, 1},
        new List<int>{ (int)TokenType.DEFAULT, '?', (int)TokenType.QUESTIOIN_SIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, ':', (int)TokenType.COLON, 1},
        new List<int>{ (int)TokenType.DEFAULT, '^', (int)TokenType.BITXOR, 2},
        new List<int>{ (int)TokenType.BITXOR, '=', (int)TokenType.BITXOR_ASSIGN, 1},
        // '~' has no compound-assignment form, so it is complete on its own.
        new List<int>{ (int)TokenType.DEFAULT, '~', (int)TokenType.BIT_NOT, 1},
        new List<int>{ (int)TokenType.DEFAULT, '+', (int)TokenType.ADD, 2},
        new List<int>{ (int)TokenType.ADD, '+', (int)TokenType.INCREASE, 1},
        new List<int>{ (int)TokenType.ADD, '=', (int)TokenType.ADD_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '-', (int)TokenType.SUB, 2},
        new List<int>{ (int)TokenType.SUB, '-', (int)TokenType.DECREASE, 1},
        new List<int>{ (int)TokenType.SUB, '=', (int)TokenType.SUB_ASSIGN, 1},
        new List<int>{ (int)TokenType.SUB, '>', (int)TokenType.POINTER_TO, 1},
        new List<int>{ (int)TokenType.DEFAULT, '*', (int)TokenType.STAR, 2},
        new List<int>{ (int)TokenType.STAR, '=', (int)TokenType.MUL_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '/', (int)TokenType.DIV, 2},
        new List<int>{ (int)TokenType.DIV, '=', (int)TokenType.DIV_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '%', (int)TokenType.MOD, 2},
        new List<int>{ (int)TokenType.MOD, '=', (int)TokenType.MOD_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '=', (int)TokenType.ASSIGN, 2},
        new List<int>{ (int)TokenType.ASSIGN, '=', (int)TokenType.EQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '!', (int)TokenType.NOT, 2},
        new List<int>{ (int)TokenType.NOT, '=', (int)TokenType.NOTEQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '&', (int)TokenType.BITAND, 2},
        new List<int>{ (int)TokenType.BITAND, '&', (int)TokenType.AND, 1},
        new List<int>{ (int)TokenType.BITAND, '=', (int)TokenType.BITAND_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '|', (int)TokenType.BITOR, 2},
        new List<int>{ (int)TokenType.BITOR, '|', (int)TokenType.OR, 1},
        new List<int>{ (int)TokenType.BITOR, '=', (int)TokenType.BITOR_ASSIGN, 1},
        new List<int>{ (int)TokenType.DEFAULT, '>', (int)TokenType.BIGGER, 2},
        new List<int>{ (int)TokenType.BIGGER, '>', (int)TokenType.RIGHT_MOVE, 2},
        new List<int>{ (int)TokenType.RIGHT_MOVE, '=', (int)TokenType.RIGHT_MOVE_ASSIGN, 1},
        new List<int>{ (int)TokenType.BIGGER, '=', (int)TokenType.BIGGER_EQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '<', (int)TokenType.LESS, 2},
        new List<int>{ (int)TokenType.LESS, '<', (int)TokenType.LEFT_MOVE, 2},
        new List<int>{ (int)TokenType.LEFT_MOVE, '=', (int)TokenType.LEFT_MOVE_ASSIGN, 1},
        new List<int>{ (int)TokenType.LESS, '=', (int)TokenType.LESS_EQUAL, 1},
        new List<int>{ (int)TokenType.DEFAULT, '.', (int)TokenType.DOT, 2},
        new List<int>{ (int)TokenType.DOT, '.', (int)TokenType.DOTDOT, 3},
        new List<int>{ (int)TokenType.DOTDOT, '.', (int)TokenType.ELLIPSIS, 1}
    };

    // Recognises a symbol token starting at the lookahead character ch by
    // repeatedly following OperateArr transitions (longest match). Reports an
    // error through AST.exitMsg when ch starts no symbol, or when the symbol
    // stops in a state that requires further input (flag 3).
    Token getTokenBySignChar(char ch)
    {
        var lastState = 0;
        var token = new Token(this, TokenType.DEFAULT, this.LineNumber, this.Col);
        var advanced = true;
        // Look the (state, character) pair up in the whole table on every step.
        // The original scanned only a fixed window of following rows and so
        // never reached the ">=" and "<=" rows.
        while (advanced && lastState != 1)
        {
            advanced = false;
            foreach (var item in OperateArr)
            {
                if (item[0] != (int)token.tokenType || (char)item[1] != ch) continue;
                token.OrigStr += ch;
                skip(ch);
                ch = LookaheadChar(1);
                token.tokenType = (TokenType)item[2];
                lastState = item[3];
                advanced = true;
                break;
            }
        }
        if (lastState == 0)
        {
            // Consume the offending character so that a caller which keeps
            // going after the error does not see it again forever.
            token.OrigStr += ch;
            nextChar();
        }
        if (lastState == 3 || lastState == 0) AST.exitMsg(string.Format("Invalid char:{0} {1}", this.LineNumber, this.Col));
        token.EndLineNumber = this.LineNumber;
        token.EndCol = this.Col;
        return token;
    }

    // Reads one (possibly escaped) character of a string or character literal,
    // appends its source text to t.OrigStr and returns the decoded character.
    // shiftCh is the literal's own quote character, which may be escaped.
    // Throws Exception for an unknown escape sequence.
    private char GetStrChar(Token t, char shiftCh)
    {
        char ch = nextChar();
        t.OrigStr += ch.ToString();
        if (ch != '\\') return ch;
        ch = nextChar();
        if (ch == '0')
        {
            // Numeric escape: \0, \0NNN (octal) or \0xNN (hex).
            var strAr = numStr(ch).Split('_');
            var baseNumStr = strAr[0];
            var numText = strAr[1];
            // Append rather than overwrite, so the text read so far is kept.
            t.OrigStr += numText;
            var realNumStr = baseNumStr == "16" ? numText.Substring(2) : numText;
            return (char)Convert.ToInt32(realNumStr, int.Parse(baseNumStr));
        }
        t.OrigStr += ch.ToString();
        switch (ch)
        {
            case 'a':
                return '\a';
            case 'b':
                return '\b';
            case 't':
                return '\t';
            case 'r':
                return '\r';
            case 'n':
                return '\n';
            case 'v':
                return '\v';
            case 'f':
                return '\f';
            case '\\':
                return '\\';
            default:
                if (ch == shiftCh) return shiftCh;
                else if (ch >= '!' && ch <= '~')
                    throw new Exception(string.Format("非法转义字符: \'\\{0}\'", (char)ch));
                else
                    throw new Exception(string.Format("非法转义字符: \'\\0x{0:X}\'", (int)ch));
        }
    }

    // Reads the rest of a number whose first digit ch has already been
    // consumed. Returns "<base>_<text>", e.g. "10_123", "08_017", "16_0x1F".
    string numStr(char ch)
    {
        var baseNum = 10;
        var str = ch.ToString();
        char chNext = LookaheadChar(1);
        // A leading 0 means one of: a lone 0, 0x... (hexadecimal), 0... (octal).
        if (ch == '0')
        {
            if (chNext == 'x' || chNext == 'X') // hexadecimal
            {
                skip(chNext);
                str += chNext.ToString();
                baseNum = 16;
            }
            else if (isDigital(chNext)) // another digit follows: octal
            {
                baseNum = 8;
            }
            else
            {
                // a lone 0
                return "10_" + str;
            }
        }
        while (isBaseNumLetter(LookaheadChar(1), baseNum))
            str += nextChar();
        return baseNum.ToString("d2") + "_" + str;
    }

    // Builds an INT token from a number whose first digit ch has already been consumed.
    private Token LexNumToken(char ch)
    {
        var t = new Token(this, TokenType.INT, this.LineNumber, this.Col);
        var strAr = numStr(ch).Split('_');
        var baseNumStr = strAr[0];
        t.OrigStr = strAr[1];
        // The whole text, including any "0x" prefix, is handed to Convert.ToInt32.
        t.int_val = Convert.ToInt32(t.OrigStr, int.Parse(baseNumStr));
        return t;
    }

    // True for characters allowed inside an identifier.
    bool isIdentifierChar(char ch)
    {
        return isLetter(ch) || ch == '_' || isDigital(ch);
    }

    // Numeric value of a digit or hex letter; throws Exception for anything else.
    int getNumFromChar(char ch)
    {
        if (isDigital(ch))
            return ch - '0';
        if (ch >= 'a' && ch <= 'f')
            return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
            return ch - 'A' + 10;
        throw new Exception("8进制或16进制数据错误!");
    }

    // True when ch is a valid digit in the given base (8, 10 or 16).
    private bool isBaseNumLetter(char ch, int baseNum)
    {
        var isHexLetter = (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
        if (!isDigital(ch) && !isHexLetter)
            return false;
        return getNumFromChar(ch) < baseNum;
    }

    // True for ASCII letters.
    private bool isLetter(char ch)
    {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    }

    // True for ASCII decimal digits.
    private bool isDigital(char ch)
    {
        return ch >= '0' && ch <= '9';
    }

    /// <summary>
    /// Tokenizes <paramref name="program"/> and writes one line per token
    /// (type, then original text) to the console, ending with the EOF token.
    /// </summary>
    public static void testPrint(string program)
    {
        var l = new Lexer(program);
        while (true)
        {
            var t = l.nextToken();
            Console.WriteLine(t.tokenType.ToString().PadRight(18) + ":" + t.OrigStr);
            if (t.tokenType == TokenType.EOF) break;
        }
    }
}
/// <summary>
/// Kinds of token handled by the lexer. Every value is written out explicitly so
/// that inserting or reordering members cannot silently renumber the others.
/// </summary>
public enum TokenType
{
    // ---- bookkeeping ----
    /// <summary>Placeholder type; also the start state when recognising symbol tokens.</summary>
    DEFAULT = 0,
    /// <summary>Line comment or block comment.</summary>
    COMMENT = 1,
    /// <summary>End of input; returned when no other token is left.</summary>
    EOF = 2,

    // ---- literals and names ----
    /// <summary>Integer literal.</summary>
    INT = 3,
    /// <summary>Real (floating-point) literal.</summary>
    REAL = 4,
    /// <summary>String literal.</summary>
    STR = 5,
    /// <summary>Character literal.</summary>
    CHAR = 6,
    /// <summary>Identifier; keywords such as var/fun are also reported as ID.</summary>
    ID = 7,

    // ---- logical / assignment / unary ----
    AND = 8,                 // &&
    OR = 9,                  // ||
    ASSIGN = 10,             // =
    NOT = 11,                // !
    BIT_NOT = 12,            // ~

    // ---- dots ----
    DOT = 13,                // .
    DOTDOT = 14,             // .. (intermediate state on the way to ELLIPSIS)
    ELLIPSIS = 15,           // ...

    // ---- arithmetic and bitwise ----
    ADD = 16,                // +
    SUB = 17,                // -
    STAR = 18,               // *
    DIV = 19,                // /
    MOD = 20,                // %
    BITAND = 21,             // &
    BITOR = 22,              // |
    BITXOR = 23,             // ^
    INCREASE = 24,           // ++
    DECREASE = 25,           // --

    // ---- compound assignment ----
    ADD_ASSIGN = 26,         // +=
    SUB_ASSIGN = 27,         // -=
    MUL_ASSIGN = 28,         // *=
    DIV_ASSIGN = 29,         // /=
    MOD_ASSIGN = 30,         // %=
    BITAND_ASSIGN = 31,      // &=
    BITOR_ASSIGN = 32,       // |=
    BITXOR_ASSIGN = 33,      // ^=

    // ---- shifts ----
    LEFT_MOVE = 34,          // <<
    RIGHT_MOVE = 35,         // >>
    LEFT_MOVE_ASSIGN = 36,   // <<=
    RIGHT_MOVE_ASSIGN = 37,  // >>=

    // ---- comparison ----
    EQUAL = 38,              // ==
    NOTEQUAL = 39,           // !=
    BIGGER = 40,             // >
    LESS = 41,               // <
    BIGGER_EQUAL = 42,       // >=
    LESS_EQUAL = 43,         // <=

    // ---- brackets and punctuation ----
    LEFT_PARA = 44,          // (
    RIGHT_PARA = 45,         // )
    LEFT_SQUARE = 46,        // [
    RIGHT_SQUARE = 47,       // ]
    BEGIN = 48,              // {
    END = 49,                // }
    COMMA = 50,              // ,
    COLON = 51,              // :
    SEMICOLON = 52,          // ;
    QUESTIOIN_SIGN = 53,     // ? (identifier spelling kept as-is for compatibility)
    POINTER_TO = 54,         // presumably "->" -- TODO confirm against the parser
}
/// <summary>
/// One lexical unit: its type, the source text it came from, its decoded value
/// and the start/end position recorded by the lexer.
/// </summary>
public class Token
{
    // ---- position ----
    /// <summary>Line number recorded when the token was created.</summary>
    public int LineNumber;
    /// <summary>Column recorded when the token was created.</summary>
    public int Col;
    /// <summary>End line; starts out equal to <see cref="LineNumber"/>.</summary>
    public int EndLineNumber;
    /// <summary>End column; starts out equal to <see cref="Col"/>.</summary>
    public int EndCol;

    // ---- origin and classification ----
    /// <summary>Lexer instance passed to the constructor.</summary>
    public Lexer lex;
    /// <summary>Kind of this token.</summary>
    public TokenType tokenType;
    /// <summary>Original source text of the token; empty until filled in.</summary>
    public string OrigStr;

    // ---- decoded payload (which field is meaningful depends on tokenType) ----
    /// <summary>Integer payload; 0 by default.</summary>
    public int int_val;
    /// <summary>Real-number payload; 0.0 by default.</summary>
    public double real_val;
    /// <summary>String payload; empty by default.</summary>
    public string str_value;
    /// <summary>Identifier text; empty by default.</summary>
    public string identifier;

    /// <summary>
    /// Creates a token of the given type at the given position. The end position
    /// is initialised to the start position and all payload fields to their
    /// empty/zero defaults.
    /// </summary>
    /// <param name="lex">Lexer to associate with the token.</param>
    /// <param name="tokenType">Kind of the token.</param>
    /// <param name="LineNumber">Start line.</param>
    /// <param name="Col">Start column.</param>
    public Token(Lexer lex, TokenType tokenType, int LineNumber, int Col)
    {
        this.lex = lex;
        this.tokenType = tokenType;

        this.LineNumber = LineNumber;
        this.EndLineNumber = LineNumber;
        this.Col = Col;
        this.EndCol = Col;

        this.identifier = "";
        this.OrigStr = this.identifier;
        this.str_value = "";
        this.int_val = 0;
        this.real_val = 0.0;
    }
}
}