一、实验目的:
加深对词法分析器的工作过程的理解;加强对词法分析方法的掌握;能够采用一种编程语言实现简单的词法分析程序;能够使用自己编写的分析程序对简单的程序段进行词法分析。
二、实验内容:
1.要识别的词素:
(1)保留字或关键字: 如:BEGIN、END、VAR、INTEGER、REAL、IF、THEN、READ、WRITE、WHILE。
(2)运算符: 如:+、-、*、/、:=、=、>、<、>=、<=
(3)标识符: 用户定义的变量名、常数名、过程名
(4)常数: 如:10、25、100、2.3等整数或实数
(5)界符: 如:‘,’、‘.’、‘;’、‘(’、‘)’、‘{’、‘}’、‘:’
2.词法分析过程所要完成的任务:
(1)给出源程序(要求一段完整的代码)
(2)滤空格
(3)识别保留字
(4)识别标识符
(5)拼数
(6)拼复合词(如“:=”)
(7)输出源程序的token(词法单元)序列。
需要进行词法分析的源程序:
识别后的输出序列:
源代码:
package cifaapp;
import java.io.*;
import java.util.*;
/**
 * One lexical unit produced by the analyser: a category code together with
 * the text that was recognised.
 */
class token {
    /** Category code; one of the int constants declared in {@code app}. */
    Integer key;
    /** The lexeme as it was assembled from the separated source words. */
    String value;

    token(Integer key, String value) {
        this.key = key;
        this.value = value;
    }
}

/**
 * A small lexical analyser for a C-like language.
 *
 * <p>The work is done in two passes. {@link #separate(String)} cuts every
 * source line into raw words (splitting on whitespace and around each
 * operator/delimiter character) and appends them to {@link #list}.
 * {@link #analyse()} then walks those words, joins compound operators and
 * signed constants, classifies each word and appends the result to
 * {@link #tokenlist}.
 *
 * <p>Known limitation: whitespace is discarded by the first pass, so
 * {@code = =} written with a space in between is still reported as
 * {@code ==}.
 */
public class app {
    // Token category codes. 1-17 are keywords; each value is the 1-based
    // position of the keyword in the key array below.
    public static final int CHAR = 1;
    public static final int SHORT = 2;
    public static final int INT = 3;
    public static final int LONG = 4;
    public static final int FLOAT = 5;
    public static final int DOUBLE = 6;
    public static final int FINAL = 7;
    public static final int STATIC = 8;
    public static final int IF = 9;
    public static final int ELSE = 10;
    public static final int WHILE = 11;
    public static final int DO = 12;
    public static final int FOR = 13;
    public static final int BREAK = 14;
    public static final int CONTINUE = 15;
    public static final int VOID = 16;
    public static final int RETURN = 17;
    public static String key[] = {"char", "short", "int", "long", "float", "double", "final", "static",
            "if", "else", "while", "do", "for", "break", "continue", "void", "return"};

    // 20 is the identifier category.
    public static final int ID = 20;

    // 30 is the numeric-constant category.
    public static final int NUM = 30;

    // 31-40 are operators.
    public static final int AS = 31;  // =
    public static final int EQ = 32;  // ==
    public static final int GT = 33;  // >
    public static final int LT = 34;  // <
    public static final int GE = 35;  // >=
    public static final int LE = 36;  // <=
    public static final int ADD = 37; // +
    public static final int SUB = 38; // -
    public static final int MUL = 39; // *
    public static final int DIV = 40; // /

    // 41-49 are delimiters.
    public static final int LP = 41;  // (
    public static final int RP = 42;  // )
    public static final int LBT = 43; // [
    public static final int RBT = 44; // ]
    public static final int LBS = 45; // {
    public static final int RBS = 46; // }
    public static final int COM = 47; // ,
    public static final int COL = 48; // :
    public static final int SEM = 49; // ;

    // -1 marks a word that could not be recognised.
    public static final int ERROR = -1;

    /** Number of unrecognised words seen by {@link #analyse()} so far. */
    public static int errorNum = 0;
    /** Raw words produced by {@link #separate(String)}, in source order. */
    public static LinkedList<String> list = new LinkedList<String>();
    /** Tokens produced by {@link #analyse()}, in source order. */
    public static LinkedList<token> tokenlist = new LinkedList<token>();

    /**
     * Word separator: runs of whitespace, plus a zero-width cut immediately
     * before and after every operator/delimiter character. Compiled once
     * instead of on every line.
     */
    private static final java.util.regex.Pattern SEPARATOR =
            java.util.regex.Pattern.compile(
                    "\\s+|\\n"
                    + "|(?<=\\+)|(?=\\+)"
                    + "|(?<=-)|(?=-)"
                    + "|(?<=\\*)|(?=\\*)"
                    + "|(?<=/)|(?=/)"
                    + "|(?<=\\>)|(?=\\>)"
                    + "|(?<=\\<)|(?=\\<)"
                    + "|(?<==)|(?==)"
                    + "|(?<=\\()|(?=\\()"
                    + "|(?<=\\))|(?=\\))"
                    + "|(?<=\\[)|(?=\\[)"
                    + "|(?<=])|(?=])"
                    + "|(?<=\\{)|(?=\\{)"
                    + "|(?<=})|(?=})"
                    + "|(?<=,)|(?=,)"
                    + "|(?<=:)|(?=:)"
                    + "|(?<=;)|(?=;)");

    /**
     * Reads {@code code.txt} from the working directory line by line and feeds
     * every line to {@link #separate(String)}.
     *
     * <p>No exception escapes this method. An I/O failure is reported on
     * {@code System.err} so that a missing or unreadable file is not mistaken
     * for an empty program.
     */
    public static void readFileByLines() {
        File file = new File("code.txt");
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line;
            // readLine() returns null at end of file.
            while ((line = reader.readLine()) != null) {
                separate(line);
            }
        } catch (IOException e) {
            System.err.println("Cannot read " + file.getPath() + ": " + e.getMessage());
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Cuts one source line into raw words and appends them to {@link #list}.
     *
     * <p>Splitting yields empty strings wherever whitespace touches an
     * operator (and for blank or indented lines); those carry no information
     * and are dropped here so the analyser never sees them.
     *
     * @param linestring one line of source text; {@code null} is ignored
     */
    public static void separate(String linestring) {
        if (linestring == null) {
            return;
        }
        String[] pieces = SEPARATOR.split(linestring);
        for (int i = 0; i < pieces.length; i++) {
            if (pieces[i].length() > 0) {
                list.add(pieces[i]);
            }
        }
    }

    /**
     * Classifies every word in {@link #list} and appends the resulting tokens
     * to {@link #tokenlist}; {@link #errorNum} is incremented once per
     * unrecognised word.
     *
     * <p>Joining rules:
     * <ul>
     *   <li>{@code =}, {@code >} or {@code <} directly followed by {@code =}
     *       becomes {@code ==}, {@code >=} or {@code <=};</li>
     *   <li>{@code +} or {@code -} that directly follows {@code =} or
     *       {@code (} and is followed by a number becomes part of that number
     *       (a signed constant);</li>
     *   <li>everything else is a token of its own, so {@code 10+20} yields
     *       three tokens.</li>
     * </ul>
     */
    public static void analyse() {
        // Work on an array-backed snapshot: indexed access on the LinkedList
        // would make the scan quadratic. Blank entries are skipped in case
        // the public list was filled by something other than separate().
        List<String> words = new ArrayList<String>(list.size());
        for (String w : list) {
            if (w != null && w.length() > 0) {
                words.add(w);
            }
        }

        int count = words.size();
        for (int i = 0; i < count; i++) {
            String cur = words.get(i);
            // Neighbours are null at the edges, which makes every
            // look-behind / look-ahead test below simply fail there.
            String prev = i > 0 ? words.get(i - 1) : null;
            String next = i + 1 < count ? words.get(i + 1) : null;

            if (cur.length() == 1) {
                char c = cur.charAt(0);

                if (c == '=' || c == '>' || c == '<') {
                    if ("=".equals(next)) {
                        int compound = c == '=' ? EQ : (c == '>' ? GE : LE);
                        tokenlist.add(new token(compound, cur + next));
                        i++; // the trailing '=' has been consumed
                    } else {
                        int simple = c == '=' ? AS : (c == '>' ? GT : LT);
                        tokenlist.add(new token(simple, cur));
                    }
                    continue;
                }

                if (c == '+' || c == '-') {
                    boolean signPosition = "=".equals(prev) || "(".equals(prev);
                    if (signPosition && next != null && isNum(next)) {
                        tokenlist.add(new token(NUM, cur + next));
                        i++; // the digits have been consumed
                    } else {
                        tokenlist.add(new token(c == '+' ? ADD : SUB, cur));
                    }
                    continue;
                }

                int fixed = singleCharCode(c);
                if (fixed != 0) {
                    tokenlist.add(new token(fixed, cur));
                    continue;
                }
            }

            int code = classifyWord(cur);
            tokenlist.add(new token(code, cur));
            if (code == ERROR) {
                errorNum++;
            }
        }
    }

    /** Returns the code of a one-character operator/delimiter, or 0 if {@code c} is neither. */
    private static int singleCharCode(char c) {
        switch (c) {
            case '*': return MUL;
            case '/': return DIV;
            case '(': return LP;
            case ')': return RP;
            case '[': return LBT;
            case ']': return RBT;
            case '{': return LBS;
            case '}': return RBS;
            case ',': return COM;
            case ':': return COL;
            case ';': return SEM;
            default:  return 0;
        }
    }

    /** Classifies a non-empty word as constant, keyword, identifier or {@link #ERROR}. */
    private static int classifyWord(String word) {
        char first = word.charAt(0);
        if ((first >= '0' && first <= '9') || first == '.') {
            // Anything that starts like a number must be a well-formed number.
            return isNum(word) ? NUM : ERROR;
        }
        int keyword = isKeyID(word);
        if (keyword != 0) {
            return keyword;
        }
        return isIdentifier(word) ? ID : ERROR;
    }

    /** True if {@code word} is a letter or '_' followed only by letters, digits or '_'. */
    private static boolean isIdentifier(String word) {
        if (word.length() == 0) {
            return false;
        }
        char first = word.charAt(0);
        if (!isLetter(first) && first != '_') {
            return false;
        }
        for (int i = 1; i < word.length(); i++) {
            char c = word.charAt(i);
            boolean digit = c >= '0' && c <= '9';
            if (!isLetter(c) && !digit && c != '_') {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether {@code str} is an unsigned integer or real constant:
     * ASCII digits with at most one decimal point and at least one digit
     * (for example {@code 10}, {@code 2.3}, {@code .5}).
     *
     * @param str candidate text
     * @return {@code true} for a well-formed constant; {@code false} for
     *         {@code null}, the empty string and anything else
     */
    public static boolean isNum(String str) {
        if (str == null || str.length() == 0) {
            return false;
        }
        boolean sawDigit = false;
        boolean sawDot = false;
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= '0' && c <= '9') {
                sawDigit = true;
            } else if (c == '.' && !sawDot) {
                sawDot = true;
            } else {
                return false;
            }
        }
        return sawDigit;
    }

    /**
     * Looks {@code s} up in the keyword table.
     *
     * @param s candidate word
     * @return the keyword's category code (1-17, matching the constants
     *         {@link #CHAR} .. {@link #RETURN}), or 0 if {@code s} is not a keyword
     */
    static int isKeyID(String s) {
        for (int i = 0; i < key.length; i++) {
            if (key[i].equals(s)) {
                return i + 1; // codes are 1-based; 0 is reserved for "not a keyword"
            }
        }
        return 0;
    }

    /** Returns true for an ASCII letter of either case. */
    static boolean isLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Returns the category label printed after a token, or null if the code has none. */
    private static String categoryLabel(int code) {
        if (code > 0 && code < 20) {
            return "//关键字";
        } else if (code == 20) {
            return "//标识符";
        } else if (code == 30) {
            return "//常量";
        } else if (code > 30 && code <= 40) {
            return "//运算符";
        } else if (code > 40 && code < 50) {
            return "//界限符";
        } else if (code == -1) {
            return "//无法识别的符号";
        }
        return null;
    }

    /**
     * Analyses {@code code.txt} and prints the separated words, one line per
     * token with its category, and the number of unrecognised words.
     */
    public static void main(String args[]) {
        readFileByLines();
        analyse();
        System.out.println(list);
        System.out.println("词法分析结果如下:\n<单词种别码,单词> //所属类别");
        for (token t : tokenlist) {
            System.out.print("< " + t.key + " " + t.value + " > ");
            String label = categoryLabel(t.key);
            if (label != null) {
                System.out.println(label);
            }
        }
        System.out.println("词法分析结束!共" + errorNum + "个无法识别的符号");
    }
}