贵州大学-编译原理实验1-词法分析器
考虑如下C语言子集
单词 | 类别编码 | 助记符 | 值 | 备注 |
---|---|---|---|---|
break | 1 | BREAK | _ | |
char | 2 | CHAR | _ | |
do | 3 | DO | _ | |
double | 4 | DOUBLE | _ | |
else | 5 | ELSE | _ | |
if | 6 | IF | _ | |
int | 7 | INT | _ | |
return | 8 | RETURN | _ | |
void | 9 | VOID | _ | |
while | 10 | WHILE | _ | |
标识符 | 11 | ID | 构成标识符的字符串 | |
常数 | 12 | NUM | 数值 | |
字符串 | 13 | STRING | 字符串 | |
+ | 14 | ADD | _ | |
- | 15 | SUB | _ | |
* | 16 | MUL | _ | |
/ | 17 | DIV | _ | |
> | 18 | GT | _ | |
>= | 19 | GE | _ | |
< | 20 | LT | _ | |
<= | 21 | LE | _ | |
== | 22 | EQ | _ | |
!= | 23 | NE | _ | |
= | 24 | ASSIGN | _ | |
{ | 25 | LB | _ | |
} | 26 | RB | _ | |
( | 27 | LR | _ | |
) | 28 | RR | _ | |
, | 29 | COMMA | _ | |
; | 30 | SEMI | _ | |
单词的正则定义如下:
D = [0-9]
L = [a-zA-Z_]
H = [a-fA-F0-9]
E = [Ee][+-]?{D}+
FS = (f|F|l|L)
IS = (u|U|l|L)*
(1) 标识符
id = {L}({L}|{D})*
(2) 常数
num =
0[xX]{H}+{IS}? |
0{D}+{IS}? |
{D}+{IS}? |
L?'(\\.|[^\\'])+' |
{D}+{E}{FS}? |
{D}*"."{D}+({E})?{FS}? |
{D}+"."{D}*({E})?{FS}?
(3) 字符串
string = L?\"(\\.|[^\\"])*\"
对给定的源程序进行词法分析,每个单词一行,以二元组的形式输出结果。
例如,下面的源程序代码
void main()
{
/* compute 1 + 2 + ... + 100 */
double sum = 0.0;
double x = 1.0;
while (x <= 100) sum = sum + x;
printf("sum = %f\n", sum);
}
词法分析的结果为
(VOID, _)
(ID, "main")
(LR, _)
(RR, _)
(LB, _)
(DOUBLE, _)
(ID, "sum")
(ASSIGN, _)
(NUM, 0.0)
(SEMI, _)
(DOUBLE, _)
(ID, "x")
(ASSIGN, _)
(NUM, 1.0)
(SEMI, _)
(WHILE, _)
(LR, _)
(ID, "x")
(LE, _)
(NUM, 100)
(RR, _)
(ID, "sum")
(ASSIGN, _)
(ID, "sum")
(ADD, _)
(ID, "x")
(SEMI, _)
(ID, "printf")
(LR, _)
(STRING, "sum = %f\n")
(COMMA, _)
(ID, "sum")
(RR, _)
(SEMI, _)
(RB, _)
词法分析一般使用 lex 实现,手写代码会比较复杂;
windows 平台一般使用 flex ;lex 和flex 基本上差不多。
【Lex制作词法分析器——实验】 https://www.bilibili.com/video/BV1K84y1Y7G9/?share_source=copy_web&vd_source=a80491be26f09f5f59bd172bab9bc237
参考这个视频,挺简单的
可以自己学一下 lex 简单入门,花个10分钟应该差不多了,了解一下基本的结构
视频里的编译命令
lex C.lex
testin.c
cc lex.yy.c -ll
%{
/*
 * Lexical analyser for a small C subset (lex/flex specification).
 * Each recognised token is printed on its own line, prefixed with a
 * running token number.
 */
#include <stdio.h>
#include <stdlib.h>

/* Running number of tokens printed so far; incremented by every token rule. */
int count = 0;
%}

/* Basic character classes. */
D [0-9]
L [a-zA-Z_]
H [a-fA-F0-9]

/* Exponent part of a floating constant, e.g. e+10. */
E [Ee][+-]?{D}+

/* Floating suffix (one of f F l L) and integer suffix (any run of u U l L). */
FS (f|F|l|L)
IS (u|U|l|L)*

/* Identifier: a letter or underscore followed by letters, underscores or digits. */
id {L}({L}|{D})*

/* Whitespace, including '\r' so files with CRLF line endings scan cleanly. */
delim [ \t\n\r]
whitespace {delim}+

/*
 * Numeric constant. Alternatives, in order: hexadecimal integer, integer with
 * a leading zero, decimal integer, character constant (optionally prefixed
 * with L), and three floating-point forms.
 */
num 0[xX]{H}+{IS}?|0{D}+{IS}?|{D}+{IS}?|L?'(\\.|[^\\'])+'|{D}+{E}{FS}?|{D}*"."{D}+({E})?{FS}?|{D}+"."{D}*({E})?{FS}?

/* String literal, optionally prefixed with L; backslash escapes are allowed. */
string L?\"(\\.|[^\\"])*\"

/*
 * Block comment. The body is built from either a non-'*' character or a run
 * of '*' followed by a character that is neither '*' nor '/', so the match
 * stops at the FIRST closing delimiter. (A pattern whose body accepts any
 * character would, under longest-match, run on to the last closing delimiter
 * in the file and swallow the code between two comments.)
 */
zhushi "/*"([^*]|\*+[^*/])*\*+"/"

/* Fallback: any single character (other than newline) not matched above. */
other .
%%
    /*
     * Token rules. Each recognised token increments the global counter and
     * prints one line of the form "<count>\t(<MNEMONIC>,<value>)".
     *
     * Keywords are listed before {id}: a keyword and an identifier match the
     * same text with the same length, and lex then picks the earlier rule.
     */
"break"         { count++; printf("%d\t(BREAK,_)\n", count); }
"char"          { count++; printf("%d\t(CHAR,_)\n", count); }
"do"            { count++; printf("%d\t(DO,_)\n", count); }
"double"        { count++; printf("%d\t(DOUBLE,_)\n", count); }
"else"          { count++; printf("%d\t(ELSE,_)\n", count); }
"if"            { count++; printf("%d\t(IF,_)\n", count); }
"int"           { count++; printf("%d\t(INT,_)\n", count); }
"return"        { count++; printf("%d\t(RETURN,_)\n", count); }
"void"          { count++; printf("%d\t(VOID,_)\n", count); }
"while"         { count++; printf("%d\t(WHILE,_)\n", count); }
">="            { count++; printf("%d\t(GE,_)\n", count); }
"<="            { count++; printf("%d\t(LE,_)\n", count); }
"=="            { count++; printf("%d\t(EQ,_)\n", count); }
"!="            { count++; printf("%d\t(NE,_)\n", count); }
{id}            { count++; printf("%d\t(ID,\"%s\")\n", count, yytext); }
{num}           { /* numeric value is printed bare, without quotes */
                  count++; printf("%d\t(NUM,%s)\n", count, yytext); }
{whitespace}    { /* skip whitespace */ }
{string}        { /* yytext already contains the surrounding double quotes */
                  count++; printf("%d\t(STRING,%s)\n", count, yytext); }
{zhushi}        { /* skip block comments */ }
"("             { count++; printf("%d\t(LR,_)\n", count); }
")"             { count++; printf("%d\t(RR,_)\n", count); }
"{"             { count++; printf("%d\t(LB,_)\n", count); }
"}"             { count++; printf("%d\t(RB,_)\n", count); }
","             { count++; printf("%d\t(COMMA,_)\n", count); }
";"             { count++; printf("%d\t(SEMI,_)\n", count); }
"="             { count++; printf("%d\t(ASSIGN,_)\n", count); }
"+"             { count++; printf("%d\t(ADD,_)\n", count); }
"<"             { count++; printf("%d\t(LT,_)\n", count); }
">"             { count++; printf("%d\t(GT,_)\n", count); }
"-"             { count++; printf("%d\t(SUB,_)\n", count); }
"/"             { count++; printf("%d\t(DIV,_)\n", count); }
"*"             { count++; printf("%d\t(MUL,_)\n", count); }
{other}         { /* any character no other rule accepts */
                  count++; printf("%d\t(wrong)\n", count); }
%%
/*
 * Entry point: run the scanner over one source file.
 *
 * The file to scan is taken from the first command-line argument; when no
 * argument is given, the original hard-coded path is used.
 *
 * Returns 0 after scanning, or EXIT_FAILURE if the file cannot be opened.
 */
int main(int argc, char **argv){
    const char *path = (argc > 1) ? argv[1] : "F:/UnxUtils/test.c";

    yyin = fopen(path, "r");
    if (yyin == NULL) {
        /* A NULL yyin would make a flex scanner silently read stdin instead
         * of reporting the missing file, so fail loudly here. */
        perror(path);
        return EXIT_FAILURE;
    }

    yylex();
    fclose(yyin);
    return 0;
}
/*
 * End-of-input hook used by lex/flex scanners.
 * Always returns 1; by lex convention a non-zero result means there is no
 * further input to switch to, so scanning stops.
 */
int yywrap()
{
    const int no_more_input = 1;
    return no_more_input;
}
编译完成的结果
输出,其中的 ^M 是因为我这个代码是在 Windows 下编写后上传到 Linux 中的,而两个平台的换行符表示不一样(Windows 用 \r\n,Linux 只用 \n,多出来的 \r 就显示为 ^M),