JAVA编写的基于正则表达式的SNL词法分析器
主要思想是利用正则表达式将SNL代码依次分辨提取为TOKEN序列
我先把我使用的例子举出来
program p
type t1 = integer;
var integer v1,v2;
procedure
q(integer i);
var integer a;
begin
a:=i;
write(a)
end
begin
read(v1);
if v1<10
then v1:=v1+10
else v1:=v1-10
fi;
q(v1)
end.
首先第一部分是我编写的正则表达式常量类
package com.compiler.bean;
/**
 * Regular-expression source strings for the lexical elements of SNL.
 *
 * <p>Each constant is the text of a {@link java.util.regex.Pattern}; nothing is
 * compiled here. Word-like elements are wrapped in {@code \b} so they only match
 * whole words; single-character symbols are written as one-element character
 * classes so that no escaping is needed.
 *
 * <p>The class keeps its implicit public constructor because existing callers
 * create instances of it.
 */
public class Match {

    // ---------------------------------------------------------------- reserved words
    public static final String PROGRAM = "\\bprogram\\b";       // program header
    public static final String TYPE = "\\btype\\b";             // type declaration
    public static final String ARRAY = "\\barray\\b";           // array declaration
    public static final String INTEGER = "\\binteger\\b";       // integer type
    public static final String CHAR = "\\bchar\\b";             // character type
    public static final String RECORD = "\\brecord\\b";         // record (struct) type
    public static final String VAR = "\\bvar\\b";               // variable declaration
    // NOTE: mixed-case name kept as-is; other classes refer to it by this spelling.
    public static final String Procedure = "\\bprocedure\\b";   // procedure declaration
    public static final String BEGIN = "\\bbegin\\b";           // begin
    public static final String END = "\\bend\\b";               // end
    public static final String READ = "\\bread\\b";             // read
    public static final String WRITE = "\\bwrite\\b";           // write
    public static final String IF = "\\bif\\b";                 // if
    public static final String THEN = "\\bthen\\b";             // then
    public static final String ELSE = "\\belse\\b";             // else
    public static final String FI = "\\bfi\\b";                 // fi
    public static final String WHILE = "\\bwhile\\b";           // while
    public static final String DO = "\\bdo\\b";                 // do
    public static final String ENDWH = "\\bendwh\\b";           // endwh
    public static final String OF = "\\bof\\b";                 // of
    public static final String RETURN = "\\breturn\\b";         // return

    // ---------------------------------------------------------------- identifiers and numbers
    public static final String ID = "\\b[_a-zA-Z][_a-zA-Z0-9]*\\b";  // identifier
    public static final String INTC = "\\b\\d+\\b";                  // unsigned integer

    // ---------------------------------------------------------------- symbols
    public static final String LEFT_BRACKET = "[\\[]";          // [
    public static final String RIGHT_BRACKET = "[\\]]";         // ]
    public static final String LEFT_PARENT = "[(]";             // (
    public static final String RIGHT_PARENT = "[)]";            // )
    public static final String LESS_THAN = "[<]";               // <
    // The look-behind keeps the '=' of ":=" from being reported as a plain '='.
    public static final String EQUAL = "(?<!:)=";               // =
    public static final String ASSIGNMENT = ":=";               // :=
    public static final String ADD = "[+]";                     // +
    public static final String SUBJECT = "[-]";                 // -
    public static final String MULTIPLY = "[*]";                // *
    public static final String DIVIDE = "[/]";                  // /
    public static final String COMMA = "[,]";                   // ,
    public static final String SEMICOLON = "[;]";               // ;
    // The look-arounds keep either dot of ".." from being reported as a single '.'.
    public static final String DOT = "(?<!\\.)[.](?!\\.)";      // .
    public static final String ARRAYDOT = "[.][.]";             // .. (array bounds separator)
    public static final String ANNOTATION_HEAD = "[{]";         // comment opener
    public static final String ANNOTATION_TAIL = "[}]";         // comment closer

    // ---------------------------------------------------------------- layout and end marker
    public static final String WHITE = " ";                     // space
    public static final String ENTER = "\n";                    // newline
    public static final String EOF = "EOF";                     // EOF
}
为了匹配方便,我将终极符与非终极符分别用两个枚举类储存
第一个是终极符(VTCategory,即词法分析器输出的各类单词)
package com.compiler.bean;
/**
 * Categories of the tokens the lexer can produce.
 *
 * <p>Every constant carries an explicit integer code, available through
 * {@link #getValue()}. The declaration order is significant and must not change.
 */
public enum VTCategory {
    PROGRAM(0),          // program header
    TYPE(1),             // type declaration
    INTEGER(2),          // integer type
    CHAR(3),             // character type
    ARRAY(4),            // array declaration
    BEGIN(5),            // begin
    PROCEDURE(6),        // procedure
    THEN(7),             // then
    LEFT_BRACKET(8),     // [
    RIGHT_BRACKET(9),    // ]
    LEFT_PARENT(10),     // (
    RIGHT_PARENT(11),    // )
    LESS_THAN(12),       // <
    EQUAL(13),           // =
    ADD(14),             // +
    SUBJECT(15),         // -
    MULTIPLY(16),        // *
    DIVIDE(17),          // /
    COMMA(18),           // ,
    SEMICOLON(19),       // ;
    DOT(20),             // .
    ARRAYDOT(21),        // ..
    ID(22),              // identifier
    RECORD(23),          // record (struct) type
    END(24),             // end
    VAR(25),             // variable declaration
    INTC(26),            // unsigned integer
    ANNOTATION_HEAD(27), // comment opener
    ANNOTATION_TAIL(28), // comment closer
    READ(29),            // read
    WRITE(30),           // write
    IF(31),              // if
    WHILE(32),           // while
    OF(33),              // of
    RETURN(34),          // return
    ELSE(35),            // else
    FI(36),              // fi
    ENDWH(37),           // endwh
    ASSIGNMENT(38),      // :=
    EOF(39),             // end-of-input marker
    ENTER(40),           // newline
    DO(41);              // do

    /** Integer code given to the constant at declaration. */
    private final int code;

    VTCategory(int code) {
        this.code = code;
    }

    /**
     * @return the integer code this constant was declared with
     */
    public int getValue() {
        return code;
    }
}
第二个是非终极符(VNCategory,供后续语法分析使用)
package com.compiler.bean;
/**
 * Grammar symbols used alongside {@link VTCategory}; each constant carries an
 * explicit integer code, available through {@link #getValue()}.
 *
 * <p>The number in the trailing comment of each constant is carried over from
 * the original source. NOTE(review): it looks like the number of the first
 * grammar production for that symbol — confirm against the SNL grammar.
 */
public enum VNCategory {
    PROGRAM(0),      // 1
    PHEAD(1),        // 2
    PNAME(2),        // 3
    DECP(3),         // 4
    TDECP(4),        // 5
    TDEC(5),         // 7
    TDECL(6),        // 8
    TDECM(7),        // 9
    TID(8),          // 11
    TDEF(9),         // 12
    BASETYPE(10),    // 15
    STYPE(11),       // 17
    ARRAYTYPE(12),   // 19
    LOW(13),         // 20
    TOP(14),         // 21
    RECTYPE(15),     // 22
    FDECL(16),       // 23
    FDECM(17),       // 25
    IDL(18),         // 27
    IDM(19),         // 28
    VDECP(20),       // 30
    VDEC(21),        // 32
    VDECL(22),       // 33
    VDECM(23),       // 34
    VIDL(24),        // 36
    VIDM(25),        // 37
    PROCDECP(26),    // 39
    PROCDEC(27),     // 41
    PROCDECM(28),    // 42
    PROCNAME(29),    // 44
    PARAML(30),      // 45
    PARAMDECL(31),   // 47
    PARAMM(32),      // 48
    PARAM(33),       // 50
    FORML(34),       // 52
    FIDMORE(35),     // 53
    PROCBODY(36),    // 56
    PBODY(37),       // 57
    STML(38),        // 58
    STMM(39),        // 59
    STM(40),         // 61
    ASSCALL(41),     // 67
    ASSREST(42),     // 69
    CONSTM(43),      // 70
    LOOPSTM(44),     // 71
    INPUTSTM(45),    // 72
    INVAR(46),       // 73
    OUTPUTSTM(47),   // 74
    RETURNSTM(48),   // 75
    CALLSTMREST(49), // 76
    ACTPARAML(50),   // 77
    ACTPARAMM(51),   // 79
    RELEXP(52),      // 81
    OTHERREL(53),    // 82
    EXP(54),         // 83
    OTHERTERM(55),   // 84
    TERM(56),        // 86
    OTHERFACTOR(57), // 87
    FACTOR(58),      // 89
    VARIABLE(59),    // 92
    VARIM(60),       // 93
    FVAR(61),        // 96
    FVARM(62),       // 97
    CMPOP(63),       // 99
    ADDOP(64),       // 101
    MULTOP(65);      // 103

    /** Integer code given to the constant at declaration. */
    private final int code;

    VNCategory(int code) {
        this.code = code;
    }

    /**
     * @return the integer code this constant was declared with
     */
    public int getValue() {
        return code;
    }
}
有了这几个类,我们就可以进行匹配。但是不可能一个一个地匹配,所以需要根据优先级进行划分;为了匹配方便,我把它们划分为四个 List 集合。为了方便存入 List 集合,首先需要建立一个单独的存储格式
package com.compiler.bean;
/**
 * One matching rule: a string paired with the token category to assign to
 * whatever that string matches.
 *
 * <p>The fields are public and read directly by other classes, so their names
 * and mutability are kept as they are.
 */
public class Cutbean {

    /** The pattern text of this rule. */
    public String str1;

    /** The category attached to the rule. */
    public VTCategory str2;

    /**
     * @param str1 pattern text
     * @param str2 category to associate with {@code str1}
     */
    public Cutbean(String str1, VTCategory str2) {
        this.str2 = str2;
        this.str1 = str1;
    }
}
以这个为基础,进行划分
package com.compiler.util;
import com.compiler.bean.Cutbean;
import com.compiler.bean.Match;
import com.compiler.bean.VTCategory;
import java.util.ArrayList;
import java.util.List;
public class CutList {
List<Cutbean> list1(){
Match match=new Match();
List<Cutbean> list = new ArrayList<>(); //匹配保留字
list.add(new Cutbean(match.PROGRAM, VTCategory.PROGRAM));
list.add(new Cutbean(match.TYPE,VTCategory.TYPE));
list.add(new Cutbean(match.ARRAY,VTCategory.ARRAY));
list.add(new Cutbean(match.INTEGER,VTCategory.INTEGER));
list.add(new Cutbean(match.CHAR,VTCategory.CHAR));
list.add(new Cutbean(match.BEGIN,VTCategory.BEGIN));
list.add(new Cutbean(match.Procedure,VTCategory.PROCEDURE));
list.add(new Cutbean(match.RECORD,VTCategory.RECORD));
list.add(new Cutbean(match.END,VTCategory.END));
list.add(new Cutbean(match.VAR,VTCategory.VAR));
list.add(new Cutbean(match.READ,VTCategory.READ));
list.add(new Cutbean(match.WHILE,VTCategory.WHILE));
list.add(new Cutbean(match.IF,VTCategory.IF));
list.add(new Cutbean(match.OF,VTCategory.OF));
list.add(new Cutbean(match.WRITE,VTCategory.WRITE));
list.add(new Cutbean(match.RETURN,VTCategory.RETURN));
list.add(new Cutbean(match.ELSE,VTCategory.ELSE));
list.add(new Cutbean(match.FI,VTCategory.FI));
list.add(new Cutbean(match.ENDWH,VTCategory.ENDWH));
list.add(new Cutbean(match.THEN,VTCategory.THEN));
list.add(new Cutbean(match.DO,VTCategory.DO));
return list;
}
List<Cutbean> list2(){
Match match=new Match();
List<Cutbean> list = new ArrayList<>(); //标识符,数字
list.add(new Cutbean(match.ID,VTCategory.ID));
list.add(new Cutbean(match.INTC,VTCategory.INTC));
return list;
}
List<Cutbean> list3(){
Match match=new Match();
List<Cutbean> list = new ArrayList<>(); //各种符号
list.add(new Cutbean(match.LEFT_BRACKET,VTCategory.LEFT_BRACKET));
list.add(new Cutbean(match.RIGHT_BRACKET,VTCategory.RIGHT_BRACKET));
list.add(new Cutbean(match.LEFT_PARENT,VTCategory.LEFT_PARENT));
list.add(new Cutbean(match.RIGHT_PARENT,VTCategory.RIGHT_PARENT));
list.add(new Cutbean(match.LESS_THAN,VTCategory.LESS_THAN));
list.add(new Cutbean(match.EQUAL,VTCategory.EQUAL));
list.add(new Cutbean(match.ADD,VTCategory.ADD));
list.add(new Cutbean(match.SUBJECT,VTCategory.SUBJECT));
list.add(new Cutbean(match.MULTIPLY,VTCategory.MULTIPLY));
list.add(new Cutbean(match.DIVIDE,VTCategory.DIVIDE));
list.add(new Cutbean(match.COMMA,VTCategory.COMMA));
list.add(new Cutbean(match.SEMICOLON,VTCategory.SEMICOLON));
list.add(new Cutbean(match.DOT,VTCategory.DOT));
list.add(new Cutbean(match.ANNOTATION_HEAD,VTCategory.ANNOTATION_HEAD));
list.add(new Cutbean(match.ANNOTATION_TAIL,VTCategory.ANNOTATION_TAIL));
list.add(new Cutbean(match.ARRAYDOT,VTCategory.ARRAYDOT));
list.add(new Cutbean(match.ASSIGNMENT,VTCategory.ASSIGNMENT));
return list;
}
List<Cutbean> list4(){
Match match=new Match();
List<Cutbean> list = new ArrayList<>(); //回车
list.add(new Cutbean(match.ENTER,VTCategory.ENTER));
return list;
}
}
划分完后,将txt文件导入,这里导入了org.apache.commons.io这个包,比较方便
package com.compiler.util;
import org.apache.commons.io.FileUtils;
import java.io.File;
import java.io.IOException;
/**
 * Loads the SNL source text that is to be tokenised.
 */
public class FileRead {

    /** Location read by the no-argument {@link #readfile()}. */
    private static final String DEFAULT_PATH = "D:\\BaiduNetdiskDownload\\text.txt";

    /**
     * Reads the file at the built-in default location.
     *
     * @return the whole file content decoded as UTF-8
     * @throws IllegalStateException if the file cannot be read
     */
    public String readfile() {
        return readfile(DEFAULT_PATH);
    }

    /**
     * Reads the file at {@code path}.
     *
     * @param path location of the file to read
     * @return the whole file content decoded as UTF-8
     * @throws IllegalStateException if the file cannot be read; the underlying
     *         {@link IOException} is attached as the cause
     */
    public String readfile(String path) {
        try {
            return FileUtils.readFileToString(new File(path), "UTF-8");
        } catch (IOException e) {
            // Fail here, with the path and the real cause, instead of handing null
            // to the caller and letting it fail later with an unrelated error.
            throw new IllegalStateException("Cannot read source file: " + path, e);
        }
    }
}
为了传入token,我准备了一个中间序列
token序列结构如下
package com.compiler.bean;
/**
 * A single lexical token: its text, its category, and the line it was found on.
 */
public class Token {

    /** Text of the token. */
    private String name;

    /** Category of the token. */
    private VTCategory category;

    /** Line number of the token. */
    private int line;

    /** Creates a token with no text, no category and line 0. */
    public Token() {
        this(null, null, 0);
    }

    /**
     * Creates a token that only has a category; text is {@code null}, line is 0.
     *
     * @param category category of the token
     */
    public Token(VTCategory category) {
        this(null, category, 0);
    }

    /**
     * @param name     text of the token
     * @param category category of the token
     * @param line     line number of the token
     */
    public Token(String name, VTCategory category, int line) {
        this.line = line;
        this.category = category;
        this.name = name;
    }

    /** @return text of the token */
    public String getName() {
        return this.name;
    }

    /** @param name new text of the token */
    public void setName(String name) {
        this.name = name;
    }

    /** @return category of the token */
    public VTCategory getCategory() {
        return this.category;
    }

    /** @param category new category of the token */
    public void setCategory(VTCategory category) {
        this.category = category;
    }

    /** @return line number of the token */
    public int getLine() {
        return this.line;
    }

    /** @param line new line number of the token */
    public void setLine(int line) {
        this.line = line;
    }
}
中间序列是为了方便计算行数使用,结构如下
package com.compiler.bean;
/**
 * Intermediate record for one match: its text, category, index and line number.
 *
 * <p>The fields are public and are read and written directly by other classes;
 * the accessors below are provided in addition.
 */
public class Centre {

    /** Matched text. */
    public String name;

    /** Category of the match. */
    public VTCategory category;

    /** Index of the match. */
    public int index;

    /** Line number of the match. */
    public int line;

    /**
     * @param name     matched text
     * @param category category of the match
     * @param index    index of the match
     * @param line     line number of the match
     */
    public Centre(String name, VTCategory category, int index, int line) {
        this.line = line;
        this.index = index;
        this.category = category;
        this.name = name;
    }

    /** @return matched text */
    public String getName() {
        return this.name;
    }

    /** @param name new matched text */
    public void setName(String name) {
        this.name = name;
    }

    /** @return category of the match */
    public VTCategory getCategory() {
        return this.category;
    }

    /** @param category new category */
    public void setCategory(VTCategory category) {
        this.category = category;
    }

    /** @return index of the match */
    public int getIndex() {
        return this.index;
    }

    /** @param index new index */
    public void setIndex(int index) {
        this.index = index;
    }

    /** @return line number of the match */
    public int getLine() {
        return this.line;
    }

    /** @param line new line number */
    public void setLine(int line) {
        this.line = line;
    }
}
最后,有了这些准备,就可以进行核心的匹配与挑选。有几个问题需要注意:第一,保留字和标识符不能被重复匹配,所以先匹配保留字,之后匹配标识符时,如果匹配到的内容与已匹配到的保留字相同,就不再添加;第二,在计算行数的问题上,我采用的方法是把每个单词的位置与回车符的位置进行比较。至于为什么能进行这些操作,主要是用到了 Java 中与正则相关的 Pattern、Matcher 等类及其方法(比如捕获组),可以去百度学习一下就明白了,还是挺好用的。最后,核心的 Cut 类如下
package com.compiler.util;
import com.compiler.bean.Centre;
import com.compiler.bean.Cutbean;
import com.compiler.bean.Token;
import com.compiler.bean.VTCategory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Splits source text into a {@link Token} sequence by applying the rule lists
 * supplied by {@link CutList}.
 */
public class Cut {

    /**
     * Tokenises {@code str}.
     *
     * <p>Steps:
     * <ol>
     *   <li>Every rule of {@code list1()} .. {@code list4()} is run over the whole
     *       text and each match is recorded with its start offset. A match of a
     *       {@code list2()} rule is dropped when its text equals the text of a
     *       {@code list1()} match, so a reserved word is never also reported as
     *       an identifier.</li>
     *   <li>The matches are sorted by start offset.</li>
     *   <li>One pass over the sorted matches assigns line numbers (starting at 1,
     *       incremented at every {@code ENTER} match), drops all {@code ENTER}
     *       matches, and drops everything from an {@code ANNOTATION_HEAD} up to
     *       and including the next {@code ANNOTATION_TAIL}.</li>
     * </ol>
     * Text that no rule matches produces no token.
     *
     * @param str the source text to tokenise; must not be {@code null}
     * @return the tokens in source order, followed by one {@code EOF} token whose
     *         line is {@code -1}
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public List<Token> read(String str) {
        if (str == null) {
            throw new NullPointerException("str must not be null");
        }

        CutList rules = new CutList();
        List<Centre> found = new ArrayList<>();

        // Stage 1: reserved words.
        collect(rules.list1(), str, found, null);

        // Texts matched as reserved words; an identifier match with one of these
        // texts is the same word seen again and must not be added twice.
        Set<String> reservedTexts = new HashSet<>();
        for (Centre c : found) {
            reservedTexts.add(c.name);
        }

        // Stages 2-4: identifiers/numbers, symbols, newlines.
        collect(rules.list2(), str, found, reservedTexts);
        collect(rules.list3(), str, found, null);
        collect(rules.list4(), str, found, null);

        // Restore source order. The sort is stable, so matches with equal offsets
        // keep their relative order of discovery.
        Collections.sort(found, new Comparator<Centre>() {
            @Override
            public int compare(Centre a, Centre b) {
                return Integer.compare(a.index, b.index);
            }
        });

        List<Token> tokens = new ArrayList<>();
        int line = 1;
        boolean inComment = false;
        for (Centre c : found) {
            if (c.category == VTCategory.ENTER) {
                // Newlines only advance the line counter; none of them is emitted,
                // including consecutive ones produced by blank lines.
                line++;
                continue;
            }
            if (c.category == VTCategory.ANNOTATION_HEAD) {
                inComment = true;
                continue;
            }
            if (c.category == VTCategory.ANNOTATION_TAIL) {
                inComment = false;
                continue;
            }
            if (inComment) {
                continue;
            }
            tokens.add(new Token(c.name, c.category, line));
        }

        tokens.add(new Token("EOF", VTCategory.EOF, -1));
        return tokens;
    }

    /**
     * Runs every rule in {@code rules} over {@code text} and appends one
     * {@link Centre} per match to {@code out}; matches whose text is contained
     * in {@code skip} are ignored. {@code skip} may be {@code null}.
     */
    private static void collect(List<Cutbean> rules, String text, List<Centre> out, Set<String> skip) {
        for (Cutbean rule : rules) {
            Matcher m = Pattern.compile(rule.str1).matcher(text);
            while (m.find()) {
                String lexeme = m.group();
                if (skip == null || !skip.contains(lexeme)) {
                    out.add(new Centre(lexeme, rule.str2, m.start(), 0));
                }
            }
        }
    }
}
最后进行相关调用就可以了
// Read the source file, tokenise it, and print one token per line
// as "<category> <text> <line>".
FileRead fileReader = new FileRead();
Cut lexer = new Cut();
List<Token> tokens = lexer.read(fileReader.readfile());
for (Token t : tokens) {
    System.out.println(t.getCategory() + " " + t.getName() + " " + t.getLine());
}
得到结果如下
别的例子应该都没问题,工程结构截图如下
希望对你有所帮助**-**