设计实现 C 语言词法分析器·了解 Flex
前置项目
安装并使用Flex
安装并使用Bison
这里需要单独安装一个flex-devel,对应的包在如下链接里
flex-devel网址
联合使用Flex和Bison
实验目的
(1)熟悉 C 语言的词法规则,了解编译器词法分析器的主要功能和实现技术,掌握典型词法分析器构造方法,设计并实现 C 语言词法分析器;
(2)了解 Flex 工作原理和基本思想,学习使用工具自动生成词法分析器;
(3)掌握编译器从前端到后端各个模块的工作原理,词法分析模块与其他模块之间的交互过程。
实验内容
根据 C 语言的词法规则,设计识别 C 语言所有单词类的词法分析器的确定有限状态自动机,并使用Python,采用程序中心法或者数据中心法设计并实现词法分析器。词法分析器的输入为 C 语言源程序,输出为属性字流。
实验过程与方法
该实验以 C 语言作为源语言,构建 C 语言的词法分析器,对于给定的测试程序,输出属性字流。词法分析器的构建按照 C 语言的词法规则进行。C 语言的发展经历了不同的阶段,早期按照 C99 标准进行编程和编译器的实现,2011 年又对 C 语言规范进行了修订,形成了 C11(又称 C1X)。下面以 C11 为基准,对 C 语言的词法规则进行简要的描述。
C 语言的关键字包括如下单词:
C 语言标识符的定义如下:
C 语言整型常量的定义如下:
C 语言浮点型常量定义如下:
C 语言字符常量定义如下:
C 语言字符串字面量定义如下:
C 语言运算符和界限符定义如下:
程序详细设计步骤
整体思路
1.读入C语言程序
2.预处理(去除空格等)
3.扫描词法分析
Identifier标识符;
Constant常量:包括整数型Integer,浮点型Float
Char字符
String字符串
Punctuation算符:Operator运算符,Delimiter界限符
4.打印输出结果
具体模块设计
读入C程序源文件
def ReadIn():
    """Read the C source file named by the first command-line argument.

    Returns:
        A ``(content, filePath)`` tuple: the list of lines of the file (line
        endings kept, as produced by ``readlines``) and the path that was read.

    Raises:
        SystemExit: with status 1, after printing ``no files``, when no path
            was supplied on the command line.
    """
    if len(sys.argv) < 2:
        print("no files")
        # A missing argument is a usage error, so report failure to the shell.
        sys.exit(1)
    filePath = sys.argv[1]
    # Decode explicitly instead of relying on the platform default encoding;
    # undecodable bytes are replaced rather than aborting the whole run.
    with open(filePath, 'r', encoding='utf-8', errors='replace') as f:
        content = f.readlines()
    return content, filePath
预处理
def PreProcess(content):
    """Strip leading whitespace from every line and append the end marker.

    Args:
        content: iterable of source lines, each normally ending in a newline.

    Returns:
        The processed lines joined into one string, followed by the sentinel
        character ``'@'`` that marks the end of the input.
    """
    pieces = []
    for line in content:
        stripped = line.lstrip()
        if not stripped and line.endswith('\n'):
            # str.lstrip() also removes the newline of a whitespace-only line;
            # keep it so that line numbers counted later match the file.
            stripped = '\n'
        pieces.append(stripped)
    pieces.append('@')
    # Join once instead of growing a string inside the loop.
    return ''.join(pieces)
标识符判断
# Identifier
if character.isalpha() or character == '_':
while character.isalpha() or character.isdigit() or character == '_':
codeValue = codeValue + character
character = code[codeIndex]
codeIndex = codeIndex + 1
for keyword in IdentifierKeywords:
if codeValue == keyword:
# 是关键词
return {'codeIndex': codeIndex - 1, 'codeValue': codeValue, 'codeLine': codeLine, 'codeType': 'keyword'}
# 普通变量
return {'codeIndex': codeIndex - 1, 'codeValue': codeValue, 'codeLine': codeLine, 'codeType': 'identifier'}
在识别了标识符后可以直接判断是不是Keyword:
# Keywords of C11 (ISO/IEC 9899:2011, section 6.4.1), including the
# underscore-prefixed ones added in C99 and C11.
IdentifierKeywords = ['auto', 'break', 'case', 'char', 'const',
                      'continue', 'default', 'do', 'double', 'else',
                      'enum', 'extern', 'float', 'for', 'goto',
                      'if', 'inline', 'int', 'long', 'register',
                      'restrict', 'return', 'short', 'signed', 'sizeof',
                      'static', 'struct', 'switch', 'typedef', 'union',
                      'unsigned', 'void', 'volatile', 'while',
                      '_Alignas', '_Alignof', '_Atomic', '_Bool', '_Complex',
                      '_Generic', '_Imaginary', '_Noreturn', '_Static_assert',
                      '_Thread_local']
常量判断
1.整数常量中有十进制、八进制和十六进制三种常量需要考虑,其中以0开头的是八进制数字,以0x或0X开头的是十六进制数字
2.浮点型常量中需要考虑科学计数法,比如1.5e-4
3.整数常量的后缀字符有 u、U(表示无符号整型 unsigned)和 l、L(表示长整型 long,ll、LL 表示 long long),两类后缀可以组合使用,比如 520ull
4.浮点型常量中后缀字符有f、F、l、L,其中 f、F 表示 float类型的浮点型常量,没有 f、F 后缀的浮点型常量我们认为是 double 类型的;l、L 表示长浮点型常量,即比如1.1F、2.9L
# Integer and float
elif character.isdigit():
constantState = 0
while character.isdigit() or character in '-.xXeEaAbBcCdDfFuUlL':
codeValue = codeValue + character
if constantState == 0:
if character == '0':
constantState = 1
elif character in '123456789':
constantState = 2
elif constantState == 1:
if character in 'xX':
constantState = 3
elif character in '01234567':
constantState = 4
elif character == '.':
constantState = 5
elif character in 'lL':
constantState = 9
elif character in 'uU':
constantState = 11
else:
constantState = -1
elif constantState == 2:
if character.isdigit():
constantState = 2
elif character == '.':
constantState = 5
elif character in 'lL':
constantState = 9
elif character in 'uU':
constantState = 11
elif constantState == 3:
if character in 'aAbBcCdDeEfF' or character.isdigit():
constantState = 14
elif constantState == 4:
if character in '01234567':
constantState = 4
elif character in 'lL':
constantState = 9
elif character in 'uU':
constantState = 11
else:
constantState = -1
elif constantState == 5:
if character.isdigit():
constantState = 6
elif constantState == 6:
if character.isdigit():
constantState = 6
elif character in 'eE':
constantState = 7
elif character in 'fFlL':
constantState = 15
else:
constantState = -1
elif constantState == 7:
if character.isdigit():
constantState = 6
elif character == '-':
constantState = 8
elif constantState == 8:
if character.isdigit():
constantState = 6
elif constantState == 9:
if character in 'lL':
constantState = 10
elif character in 'uU':
constantState = 12
else:
constantState = -1
elif constantState == 10:
if character in 'uU':
constantState = 13
else:
constantState = -1
elif constantState == 11:
if character in 'lL':
constantState = 12
else:
constantState = -1
elif constantState == 12:
if character in 'lL':
constantState = 13
else:
constantState = -1
elif constantState == 14:
if character.isdigit() or character in 'aAbBcCdDeEfF':
constantState = 14
elif character in 'lL':
constantState = 9
elif character in 'uU':
constantState = 11
else:
constantState = -1
elif constantState == 13 or constantState == 15:
if character:
constantState = -1
character = code[codeIndex]
codeIndex = codeIndex + 1
if constantState in (1, 2, 4, 9, 10, 11, 12, 13, 14):
return {'codeIndex': codeIndex - 1, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'integer constant'}
elif constantState == 6 or constantState == 15:
return {'codeIndex': codeIndex - 1, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'floating constant'}
else:
return {'codeIndex': codeIndex - 1, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'illegal constant'}
字符和字符串判断
# Characters that may follow a backslash in a simple escape sequence.
Escapecharacters = list('\'"?\\abfnrtv')
#String
elif character == '"':
stringState = 0
while codeIndex < len(code):
codeValue = codeValue + character
if stringState == 0:
if character == '"':
stringState = 1
elif stringState == 1:
if character == '\\':
stringState = 3
elif character == '"':
stringState = 2
break
elif stringState == 2:
break
elif stringState == 3:
if character in Escapecharacters:
stringState = 1
character = code[codeIndex]
codeIndex = codeIndex + 1
if stringState == 2:
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'string'}
else:
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'Illegal string'}
# Char
elif character == '\'':
charState = 0
while codeIndex < len(code):
codeValue = codeValue + character
if charState == 0:
if character == '\'':
charState = 1
elif charState == 1:
if character == '\'':
charState = 2
break
elif character == '\\':
charState = 3
elif charState == 2:
break
elif charState == 3:
if character in Escapecharacters:
charState = 1
character = code[codeIndex]
codeIndex = codeIndex + 1
if charState == 2:
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'character'}
else:
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'Illegal char'}
运算符判断
由于一些运算符由多个字符组成(如 +=、<<=),故需单独列出可以作为多字符运算符首字符的算符
# Characters that can start an operator token.
Operators = list('+-&*~!/^%=.:?#<>|`')
# Operator characters that can be the first character of a multi-character
# operator.
BinaryOperators = list('+-><=!&|*/%^#:.')
# Single-character delimiters.
Delimiters = list('[](){}\'",;\\')
# Delimiters
elif character in Delimiters:
codeValue = codeValue + character
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'delimiter'}
# Operators
elif character in Operators:
operatorState = 0
while character in Operators:
codeValue = codeValue + character
if operatorState == 0:
if not character in BinaryOperators:
operatorState = 20
break
else:
if character == '+':
operatorState = 2
elif character == '-':
operatorState = 3
elif character == '<':
operatorState = 4
elif character == '>':
operatorState = 5
elif character == '=':
operatorState = 6
elif character == '!':
operatorState = 7
elif character == '&':
operatorState = 8
elif character == '|':
operatorState = 9
elif character == '*':
operatorState = 10
elif character == '/':
operatorState = 11
elif character == '%':
operatorState = 12
elif character == '^':
operatorState = 13
elif character == '#':
operatorState = 14
elif character == ':':
operatorState = 15
elif character == '.':
operatorState = 18
elif operatorState == 1:
break
elif operatorState == 2:
if character in '+=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 3:
if character in '-=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 4:
if character in '=:%':
operatorState = 1
break
elif character == '<':
operatorState = 16
else:
operatorState = -1
elif operatorState == 5:
if character in '=':
operatorState = 1
break
elif character == '>':
operatorState = 17
else:
operatorState = -1
elif operatorState == 6:
if character == '=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 7:
if character == '=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 8:
if character in '&=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 9:
if character in '|=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 10:
if character == '=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 11:
if character == '=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 12:
if character in '=>:':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 13:
if character == '=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 14:
if character == '#':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 15:
if character == '>':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 16:
if character == '=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 17:
if character == '=':
operatorState = 1
break
else:
operatorState = -1
elif operatorState == 18:
if character == '.':
operatorState = 19
else:
operatorState = -1
elif operatorState == 19:
if character == '.':
operatorState = 1
break
else:
operatorState = -1
character = code[codeIndex]
codeIndex = codeIndex + 1
if 2 <= operatorState <= 18:
return {'codeIndex': codeIndex - 1, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'Unary operator'}
elif operatorState == 20:
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'Unary operator'}
elif operatorState == 1:
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'Multicast operator'}
else:
return {'codeIndex': codeIndex - 1, 'codeValue': codeValue, 'codeLine': codeLine,
'codeType': 'Illegal operator'}
空行处理和结尾处理
# 换行
elif character == '\n':
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine + 1, 'codeType': 'Illegal operator'}
# 结束
elif character == '@':
codeValue = codeValue + character
return {'codeIndex': codeIndex, 'codeValue': codeValue, 'codeLine': codeLine, 'codeType': 'END'}
实验结果
对如下这个程序进行分析:
int sum(int a, int b)
{
int c = a - b + 3.5 - 2;
char dd[10] = "ddasd" ;
char d = 'e'
return a + b;
}
结果为
Num 1 Line 1 KEYWORD: int 3
Num 2 Line 1 IDENTIFIER: sum 7
Num 3 Line 1 DELIMITER: ( 8
Num 4 Line 1 KEYWORD: int 11
Num 5 Line 1 IDENTIFIER: a 13
Num 6 Line 1 DELIMITER: , 14
Num 7 Line 1 KEYWORD: int 18
Num 8 Line 1 IDENTIFIER: b 20
Num 9 Line 1 DELIMITER: ) 21
Num 10 Line 2 DELIMITER: { 23
Num 11 Line 3 KEYWORD: int 27
Num 12 Line 3 IDENTIFIER: c 29
Num 13 Line 3 UNARY OPERATOR: = 31
Num 14 Line 3 IDENTIFIER: a 33
Num 15 Line 3 UNARY OPERATOR: - 35
Num 16 Line 3 IDENTIFIER: b 37
Num 17 Line 3 UNARY OPERATOR: + 39
Num 18 Line 3 FLOATING CONSTANT: 3.5 43
Num 19 Line 3 UNARY OPERATOR: - 45
Num 20 Line 3 INTEGER CONSTANT: 2 47
Num 21 Line 3 DELIMITER: ; 48
Num 22 Line 4 KEYWORD: char 53
Num 23 Line 4 IDENTIFIER: dd 56
Num 24 Line 4 DELIMITER: [ 57
Num 25 Line 4 INTEGER CONSTANT: 10 59
Num 26 Line 4 DELIMITER: ] 60
Num 27 Line 4 UNARY OPERATOR: = 62
Num 28 Line 4 STRING: "ddasd" 70
Num 29 Line 4 DELIMITER: ; 72
Num 30 Line 5 KEYWORD: char 77
Num 31 Line 5 IDENTIFIER: d 79
Num 32 Line 5 UNARY OPERATOR: = 81
Num 33 Line 5 CHARACTER: 'e' 85
Num 34 Line 6 KEYWORD: return 92
Num 35 Line 6 IDENTIFIER: a 94
Num 36 Line 6 UNARY OPERATOR: + 96
Num 37 Line 6 IDENTIFIER: b 98
Num 38 Line 6 DELIMITER: ; 99
Num 39 Line 7 DELIMITER: } 101
Num 40 Line 8 END: @ 103
相应的XML结果:
完整代码
import sys
import xml.etree.cElementTree as ElementTree
from xml.dom import minidom
def ReadIn():
    """Read the C source file named by the first command-line argument.

    Returns:
        A ``(content, filePath)`` tuple: the list of lines of the file (line
        endings kept, as produced by ``readlines``) and the path that was read.

    Raises:
        SystemExit: with status 1, after printing ``no files``, when no path
            was supplied on the command line.
    """
    if len(sys.argv) < 2:
        print("no files")
        # A missing argument is a usage error, so report failure to the shell.
        sys.exit(1)
    filePath = sys.argv[1]
    # Decode explicitly instead of relying on the platform default encoding;
    # undecodable bytes are replaced rather than aborting the whole run.
    with open(filePath, 'r', encoding='utf-8', errors='replace') as f:
        content = f.readlines()
    return content, filePath
def PreProcess(content):
    """Strip leading whitespace from every line and append the end marker.

    Args:
        content: iterable of source lines, each normally ending in a newline.

    Returns:
        The processed lines joined into one string, followed by the sentinel
        character ``'@'`` that Scanner reports as the END token.
    """
    pieces = []
    for line in content:
        stripped = line.lstrip()
        if not stripped and line.endswith('\n'):
            # str.lstrip() also removes the newline of a whitespace-only line;
            # keep it so that line numbers counted later match the file.
            stripped = '\n'
        pieces.append(stripped)
    pieces.append('@')
    # Join once instead of growing a string inside the loop.
    return ''.join(pieces)
# Keywords of C11 (ISO/IEC 9899:2011, section 6.4.1), including the
# underscore-prefixed ones added in C99 and C11.
IdentifierKeywords = ['auto', 'break', 'case', 'char', 'const',
                      'continue', 'default', 'do', 'double', 'else',
                      'enum', 'extern', 'float', 'for', 'goto',
                      'if', 'inline', 'int', 'long', 'register',
                      'restrict', 'return', 'short', 'signed', 'sizeof',
                      'static', 'struct', 'switch', 'typedef', 'union',
                      'unsigned', 'void', 'volatile', 'while',
                      '_Alignas', '_Alignof', '_Atomic', '_Bool', '_Complex',
                      '_Generic', '_Imaginary', '_Noreturn', '_Static_assert',
                      '_Thread_local']
# Characters that may follow a backslash in a simple escape sequence.
Escapecharacters = ['\'', '"', '?', '\\', 'a', 'b', 'f', 'n', 'r', 't', 'v']
# Characters that can start an operator token.
Operators = ['+', '-', '&', '*', '~', '!', '/',
             '^', '%', '=', '.', ':', '?', '#', '<', '>', '|', '`']
# Operator characters that can be the first character of a multi-character
# operator.  Scanner matches against _MULTI_CHAR_OPERATORS instead; the list
# is kept so existing users of the name keep working.
BinaryOperators = ['+', '-', '>', '<', '=', '!',
                   '&', '|', '*', '/', '%', '^', '#', ':', '.']
# Single-character delimiters.
Delimiters = ['[', ']', '(', ')', '{', '}', '\'', '"', ',', ';', '\\']

# Multi-character punctuators of C11 (section 6.4.6).  Longer spellings come
# first so that the first match found is the longest one.
_MULTI_CHAR_OPERATORS = (
    '%:%:',
    '<<=', '>>=', '...',
    '->', '++', '--', '<<', '>>', '<=', '>=', '==', '!=', '&&', '||',
    '*=', '/=', '%=', '+=', '-=', '&=', '^=', '|=', '##',
    '<:', ':>', '<%', '%>', '%:',
)
# Whitespace skipped between tokens; the newline is handled separately
# because it advances the line counter.
_BLANKS = ' \t\r\f\v'
_DECIMAL_DIGITS = '0123456789'
_OCTAL_DIGITS = '01234567'
_HEX_DIGITS = '0123456789abcdefABCDEF'
_LONG_PARTS = ('', 'l', 'L', 'll', 'LL')
# Every valid integer suffix: an optional u/U combined, in either order, with
# an optional l/L/ll/LL (mixed-case "lL" is not valid C).
_INTEGER_SUFFIXES = frozenset(
    [unsigned + long_part for unsigned in ('', 'u', 'U') for long_part in _LONG_PARTS]
    + [long_part + unsigned for unsigned in ('', 'u', 'U') for long_part in _LONG_PARTS]
)
_FLOAT_SUFFIXES = frozenset(['', 'f', 'F', 'l', 'L'])


def _make_token(index, value, line, kind):
    """Build the token dictionary returned by Scanner."""
    return {'codeIndex': index, 'codeValue': value, 'codeLine': line, 'codeType': kind}


def _span(text, start, allowed):
    """Return the index just past the run of `allowed` characters at `start`."""
    end = start
    while end < len(text) and text[end] in allowed:
        end += 1
    return end


def _number_end(code, start):
    """Return the index just past the numeric spelling that begins at `start`."""
    exponent_marks = 'pP' if code[start:start + 2] in ('0x', '0X') else 'eE'
    end = start
    size = len(code)
    while end < size:
        ch = code[end]
        if ch in '+-':
            # A sign is part of the number only directly after the exponent
            # marker; otherwise it is an operator ("3-2" is three tokens).
            if end == start or code[end - 1] not in exponent_marks:
                break
        elif not (ch.isalnum() or ch in '._'):
            break
        end += 1
    return end


def _classify_number(text):
    """Return the token type for the numeric spelling `text`."""
    is_hex = text[:2] in ('0x', '0X')
    if is_hex:
        digits, exponent_marks, body = _HEX_DIGITS, 'pP', text[2:]
    else:
        digits, exponent_marks, body = _DECIMAL_DIGITS, 'eE', text
    pos = _span(body, 0, digits)
    whole_len = pos
    fraction_len = 0
    has_point = body[pos:pos + 1] == '.'
    if has_point:
        fraction_end = _span(body, pos + 1, digits)
        fraction_len = fraction_end - (pos + 1)
        pos = fraction_end
    has_exponent = pos < len(body) and body[pos] in exponent_marks
    if has_exponent:
        pos += 1
        if pos < len(body) and body[pos] in '+-':
            pos += 1
        exponent_end = _span(body, pos, _DECIMAL_DIGITS)
        if exponent_end == pos:
            # An exponent marker must be followed by at least one digit.
            return 'illegal constant'
        pos = exponent_end
    suffix = body[pos:]
    if whole_len + fraction_len == 0:
        return 'illegal constant'
    if has_point or has_exponent:
        # A hexadecimal floating constant is only valid with a p/P exponent.
        if is_hex and not has_exponent:
            return 'illegal constant'
        return 'floating constant' if suffix in _FLOAT_SUFFIXES else 'illegal constant'
    # A leading 0 makes an integer octal, so 8 and 9 are not allowed in it.
    if not is_hex and body[0] == '0' and _span(body, 0, _OCTAL_DIGITS) != whole_len:
        return 'illegal constant'
    return 'integer constant' if suffix in _INTEGER_SUFFIXES else 'illegal constant'


def _scan_quoted(code, start, quote):
    """Scan the string or character literal opened by `quote` at `start`.

    Returns ``(end, well_formed)``: `end` is the index just past the literal
    and `well_formed` is True when the literal is closed on the same line and
    every backslash introduces a recognised escape sequence.
    """
    size = len(code)
    pos = start + 1
    well_formed = True
    while pos < size:
        ch = code[pos]
        if ch == quote:
            return pos + 1, well_formed
        # A literal cannot continue past the end of the line, and the final
        # character of the input cannot be followed by a closing quote; stop
        # before it so a trailing end marker is still reported.
        if ch == '\n' or (ch == '@' and pos == size - 1):
            break
        if ch == '\\':
            follower = code[pos + 1:pos + 2]
            if follower == '' or follower == '\n':
                well_formed = False
                pos += 1
                continue
            # Besides the simple escapes, accept the first character of an
            # octal, hexadecimal or universal-character escape.
            if follower not in Escapecharacters and follower not in '01234567xuU':
                well_formed = False
            pos += 2
            continue
        pos += 1
    return pos, False


def Scanner(code, codeIndex, codeLine):
    """Recognise the next token of `code` starting at `codeIndex`.

    Args:
        code: the whole preprocessed source text, normally ending with the
            end marker ``'@'``.
        codeIndex: index at which scanning starts.
        codeLine: line number of the character at `codeIndex`.

    Returns:
        A dict with the keys ``codeIndex`` (index just past the token),
        ``codeValue`` (the token text; empty for newlines, comments and an
        exhausted input), ``codeLine`` (line number after the token) and
        ``codeType``.  ``codeType`` is one of ``keyword``, ``identifier``,
        ``string``, ``Illegal string``, ``character``, ``Illegal char``,
        ``integer constant``, ``floating constant``, ``illegal constant``,
        ``delimiter``, ``Unary operator`` (one character),
        ``Multicast operator`` (several characters), ``comment``,
        ``Illegal comment``, ``Illegal character``, ``Illegal operator``
        (used for a newline, whose value is empty) or ``END``.
    """
    size = len(code)
    start = _span(code, codeIndex, _BLANKS)
    if start >= size:
        # Nothing left; the empty value makes the driver ignore this token.
        return _make_token(start, '', codeLine, 'END')
    character = code[start]

    # Newline: no token text, only the line counter advances.  The type name
    # is kept from the earlier implementation for compatibility.
    if character == '\n':
        return _make_token(start + 1, '', codeLine + 1, 'Illegal operator')

    # End marker: only the final character of the input counts as one.
    if character == '@' and start == size - 1:
        return _make_token(size, '@', codeLine, 'END')

    # Identifier or keyword.
    if character.isalpha() or character == '_':
        end = start + 1
        while end < size and (code[end].isalpha() or code[end].isdigit() or code[end] == '_'):
            end += 1
        word = code[start:end]
        kind = 'keyword' if word in IdentifierKeywords else 'identifier'
        return _make_token(end, word, codeLine, kind)

    # String literal or character constant.
    if character == '"' or character == '\'':
        end, well_formed = _scan_quoted(code, start, character)
        if character == '"':
            kind = 'string' if well_formed else 'Illegal string'
        else:
            # '' contains no character at all and is therefore rejected.
            kind = 'character' if well_formed and end - start > 2 else 'Illegal char'
        return _make_token(end, code[start:end], codeLine, kind)

    # Integer or floating constant (a floating constant may start with '.').
    if character.isdigit() or (character == '.' and code[start + 1:start + 2].isdigit()):
        end = _number_end(code, start)
        text = code[start:end]
        return _make_token(end, text, codeLine, _classify_number(text))

    # Comments produce no token text; they only move the position forward.
    if code.startswith('//', start):
        newline = code.find('\n', start)
        if newline == -1:
            # Last line without a newline: leave a trailing end marker alone.
            newline = size - 1 if code.endswith('@') else size
        return _make_token(newline, '', codeLine, 'comment')
    if code.startswith('/*', start):
        close = code.find('*/', start + 2)
        if close == -1:
            # Report the opener and keep scanning instead of dropping the
            # rest of the input.
            return _make_token(start + 2, '/*', codeLine, 'Illegal comment')
        end = close + 2
        return _make_token(end, '', codeLine + code.count('\n', start, end), 'comment')

    # Delimiters.
    if character in Delimiters:
        return _make_token(start + 1, character, codeLine, 'delimiter')

    # Operators: take the longest spelling that matches at this position.
    if character in Operators:
        for spelling in _MULTI_CHAR_OPERATORS:
            if code.startswith(spelling, start):
                return _make_token(start + len(spelling), spelling, codeLine,
                                   'Multicast operator')
        return _make_token(start + 1, character, codeLine, 'Unary operator')

    # Anything else is reported instead of silently returning None.
    return _make_token(start + 1, character, codeLine, 'Illegal character')
def SaveFile(xmlTree, FileName):
    """Write the element tree as pretty-printed UTF-8 XML.

    Args:
        xmlTree: root element of the tree to serialise.
        FileName: path of the analysed source; the output is written next to
            it under the same name with ``.xml`` appended.
    """
    serialized = ElementTree.tostring(xmlTree)
    # Re-parse with minidom because it can indent the output.
    document = minidom.parseString(serialized)
    pretty = document.toprettyxml(indent=' ', encoding='utf-8')
    # toprettyxml returns bytes when an encoding is given, hence binary mode.
    with open(FileName + '.xml', 'wb') as out:
        out.write(pretty)
def LexicalAnalysis(code):
    """Tokenise `code`, print every token and collect the tokens in XML form.

    Args:
        code: preprocessed source text handed to Scanner.

    Returns:
        The ``project`` root element holding one ``words`` element per token
        whose value is not empty.
    """
    root = ElementTree.Element('project')
    position = 0
    serial = 1
    line = 1
    total = len(code)
    while position < total:
        token = Scanner(code, position, line)
        position = token['codeIndex']
        line = token['codeLine']
        value = token['codeValue']
        if value == '':
            # Tokens without text only move the position or the line counter.
            continue
        kind = token['codeType']
        valid = 'false' if 'illegal' in kind.lower() else 'true'
        entry = ElementTree.SubElement(root, 'words')
        # The children are created in this fixed order for every token.
        fields = (
            ('numbers', str(serial)),
            ('value', value),
            ('keyword', kind.lower()),
            ('line', str(line)),
            ('true', valid),
        )
        for tag, text in fields:
            ElementTree.SubElement(entry, tag).text = text
        print('Num', format(serial, '>2'), 'Line', format(line, '>2'),
              format(kind.upper(), '>18') + ': ' + format(value, '<8'), position)
        serial += 1
    return root
def main():
    """Run the whole pipeline: read, preprocess, tokenise, save the XML."""
    lines, sourcePath = ReadIn()
    # Preprocessing strips indentation and appends the end marker.
    tree = LexicalAnalysis(PreProcess(lines))
    SaveFile(tree, sourcePath)


if __name__ == "__main__":
    main()