最近学编译原理,其中一个实验是完成Tiny C词法分析器,写实验报告之前先贴上来。
Tiny C语言编译程序实验一 Scanner
Tiny C词法规定:
- 仅允许整数类型,不允许实数类型
- 标识符由大小写英文字母组成,最多52个。其识别按最长匹配原则
- 整数后紧跟非数字,或标识符后紧跟非字母认为是一个新Token开始
- 由{ }括起来符号串都认为是注释部分,该部分在词法分析时被过滤掉
- 识别出的Token由两个变量:currentToken,tokenString识别,其中currentToken代表Token的类属,为一个名为TokenType的枚举类型,在文件globals.h中定义;tokenString代表Token在程序中出现的形式,即其本来面目。例如整数 10的currentToken值为NUM,而tokenString值为‘10’;标识符i的currentToken值为ID,而tokenString值为‘i’
Tiny C的Token类型定义:
/* Token categories the scanner can report. */
typedef enum
{ ENDFILE,ERROR,
IF,THEN,ELSE,END,REPEAT,UNTIL,READ,WRITE, /* reserved words */
ID,NUM,
ASSIGN,EQ,LT,PLUS,MINUS,TIMES,OVER,LPAREN,RPAREN,SEMI
} TokenType;
/* Reserved-word table: each entry pairs a keyword's spelling with the
   TokenType value that stands for it. */
static struct
{ char* str;
TokenType tok;
} reservedWords[MAXRESERVED]
= {{"if",IF},{"then",THEN},{"else",ELSE},{"end",END},
{"repeat",REPEAT},{"until",UNTIL},{"read",READ},
{"write",WRITE}};
要求:画识别符合TINY C语言构词规则的DFA。然后用直接编码的方法构造词法分析器。
DFA状态图(即确定性有限自动机):
按照DFA构造getToken函数,getToken返回的是一个TokenType枚举变量,如何正确分析词法在于getToken函数。
在main函数里,利用while(getToken()!=ENDFILE)循环,以getToken返回的TokenType作为循环条件进行词法分析。每次调用getToken时,毫无疑问首先进入的是START初始状态;在尚未到达终态DONE之前,不断读取字符,并用switch进入与当前state对应的case。
- 在每次进入getToken的循环时会读取下一个字符,当读到文件末尾的时候,该函数getNextChar()返回EOF,此时应该return ENDFILE;
- 在START状态下输入空格、制表符、换行符,将持续进入初始状态START,并且不保存,词法分析阶段过滤掉。
- 在START状态下输入= , + , - , * , / , ; , <的时候直接进入终极态,currentToken设置为相应的Token值。
- 在START下当输入数字的时候,进入INNUM状态,表示当前进入值输入的状态,在下一次循环中switch判断state的时候,进入INNUM状态,当输入的不是数字的时候表示当前状态终止,进入终极态DONE,currentToken设置为NUM,表示已经识别出一个NUM Token,并且回退一个字符,save设置为false。这样在tokenString中保存的就是整数。
- 在START下当输入字母的时候,进入INID状态,表示当前进入ID输入状态,具体的操作同上。但是每一次进入循环要判断当前保存的tokenString长度是否超过52,如果超过52,则只保留前52个字母为变量。 其次,在扫描出Token为ID之后,应该调用函数reservedLookup(char * s)判断当前的tokenString有没有可能是关键字,如果是,返回关键字的Token,否则返回ID。
- 在START下当输入{ ,将进入INCOMMENT状态,表示当前进入了注释状态。当下一次输入为 } 或者EOF时,返回初态START,并且每次都不保存。
- 在START下当输入 : 的时候进入INASSIGN状态。若下一个输入字符是 = ,表示识别出了 := 即赋值符号,进入终极态DONE,currentToken = ASSIGN;否则报错,因为不存在单独的 : 符号。
scan.c 核心程序(用于读取已经打开的文件流,并且按照一个字符一个字符扫描得到Token):
/****************************************************/
/* File: scan.c */
/* The scanner implementation for the TINY compiler */
/****************************************************/
#include "globals.h"
#include "util.h"
#include "scan.h"
/* States of the scanner DFA.  getToken starts every scan in START and
 * leaves its loop once the state becomes DONE.
 */
typedef enum
{
  START,      /* initial state of each scan */
  INASSIGN,   /* entered after ':' */
  INCOMMENT,  /* entered after '{' */
  INNUM,      /* entered after a digit */
  INID,       /* entered after a letter */
  DONE        /* a token has been recognised */
} StateType;
/* lexeme of identifier or reserved word */
char tokenString[MAXTOKENLEN+1];
/* BUFLEN = length of the input buffer for
source code lines */
#define BUFLEN 256
static char lineBuf[BUFLEN]; /* holds the current line */
static int linepos = 0; /* current position in LineBuf */
static int bufsize = 0; /* current size of buffer string */
static int EOF_flag = FALSE; /* corrects ungetNextChar behavior on EOF: while FALSE a character may be pushed back; getNextChar sets it to TRUE when fgets reports end of input */
/* getNextChar returns the next character of the source text (blanks
 * and newlines included), reading a new line into lineBuf whenever the
 * current one is exhausted.
 *
 * Returns the character as a non-negative int, or EOF once fgets
 * reports that no more input is available (EOF_flag is then set so
 * that ungetNextChar stops backing up).
 *
 * Side effects: increments lineno on every refill attempt and, when
 * EchoSource is set, echoes each freshly read line to listing.
 */
static int getNextChar(void)
{
  if (linepos >= bufsize)
  {
    /* the buffered line is used up: fetch the next one */
    lineno++;
    /* fgets already reserves one byte for the terminating '\0', so the
       whole array may be offered to it */
    if (fgets(lineBuf, sizeof lineBuf, source) == NULL)
    {
      EOF_flag = TRUE;
      return EOF;
    }
    if (EchoSource)
      fprintf(listing, "%4d: %s", lineno, lineBuf);
    bufsize = (int) strlen(lineBuf);
    linepos = 0;
  }
  /* go through unsigned char so that bytes >= 0x80 never come back
     negative: a negative value is not a valid argument for the
     <ctype.h> classifiers and could be mistaken for EOF */
  return (unsigned char) lineBuf[linepos++];
}
/* ungetNextChar steps linepos back by one so that the character read
 * last is delivered again by the next getNextChar call.  It does
 * nothing once EOF_flag has been set.
 */
static void ungetNextChar(void)
{
  if (EOF_flag)
    return;
  linepos--;
}
/* Lookup table of reserved words: each entry pairs the keyword's
 * spelling with the TokenType returned for it.  The table and the
 * spellings are read-only, hence the const qualifiers.
 */
static const struct
{
  const char *str;   /* keyword as written in the source text */
  TokenType tok;     /* token reported for that keyword */
} reservedWords[MAXRESERVED]
  = {{"if",IF},{"then",THEN},{"else",ELSE},{"end",END},
     {"repeat",REPEAT},{"until",UNTIL},{"read",READ},
     {"write",WRITE}};
/* reservedLookup decides whether the lexeme s is a reserved word.
 * It walks reservedWords from the first entry to the last and returns
 * the token of the first entry whose spelling equals s; when no entry
 * matches it returns ID.
 */
static TokenType reservedLookup (char * s)
{
  int idx = 0;
  while (idx < MAXRESERVED)
  {
    if (strcmp(s, reservedWords[idx].str) == 0)
      return reservedWords[idx].tok;
    idx++;
  }
  return ID;
}
/****************************************/
/* the primary function of the scanner */
/****************************************/
/* function getToken returns the
* next token in source file
*/
TokenType getToken(void)
{ /* index for storing into tokenString */
int tokenStringIndex = 0;
/* holds current token to be returned */
TokenType currentToken; //保存被识别Token的类属
/* current state - always begins at START */
StateType state = START;
/* flag to indicate save to tokenString */
int save;//标识当前字符是否保存,如空格,换行符\n、TAB符\t及注释中的任何字符
while (state != DONE) //DONE状态表示已识别出一个Token
{ int c = getNextChar();
if(c==EOF){
return ENDFILE;
}
//注意此时linepos指向下一个
save = TRUE;
switch (state)
{ case START:
if (c==' '){
state = START;
save = FALSE;
}
if (c=='\t'){
state = START;
save = FALSE;
}
if (c=='\n'){ //这个很重要,加了才能正确解析换行后的内容
state = START;
save = FALSE;
}
else if (isdigit(c)){
state = INNUM;
}
//此处请自己填写(字符、:、空格/tab/换行、{、算符及界符等)
else if(isalpha((int)c)){
state = INID;
}
else if(c=='+'){//要保存
state = DONE;
currentToken = PLUS;
}
else if(c=='-'){//要保存
state = DONE;
currentToken = MINUS;
}
else if(c=='*'){//要保存
state = DONE;
currentToken = TIMES;
}
else if(c=='/'){//要保存
state = DONE;
currentToken = OVER;
}
else if(c=='<'){//要保存
state = DONE;
currentToken = LT;
}
else if(c==';'){//要保存
state = DONE;
currentToken = SEMI;
}
else if(c=='='){
state = DONE;
currentToken = EQ;
}
else if(c==':'){
state = INASSIGN;
}
else if(c=='{'){
ungetNextChar();
save = FALSE;
state = INCOMMENT;
}
else
currentToken = ERROR;
break;
case INCOMMENT://注释的currentToken没有定义
if (c == '}'||c == EOF){ //此处请自己填写,仅出现‘}’或EOF(注释未完结束程序)时才改变状态。
state = START;
}
save = FALSE;
break;
case INASSIGN:
//此处请自己填写,‘=’或其它(出现错误)
if (c == '='){ /* backup in the input */
state = DONE;
currentToken = ASSIGN;
}
else{
save = FALSE;
state = DONE;
currentToken = ERROR;
}
break;
case INNUM:
if (!isdigit(c))
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = NUM;
}
break;
case INID:
if(!isalpha((int)c)){//此处请自己填写,不是字符则回吐,并进入DONE,且识别出一个ID
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = ID;
}
if(tokenStringIndex>51){
save = FALSE;
state = DONE;
currentToken = ID;
}
break;
case DONE: //不可能到
default: /* should never happen */
fprintf(listing,"Scanner Bug: state= %d\n",state);
state = DONE;
currentToken = ERROR;
break;
} //end switch
//将每一次的输入保存起来,从上一次找到Token的位置开始保存
//只有当SAVE为TRUE才保存,即还得继续往下输入;当为FALSE表示TOKEN已经找到于是不必保存
if ((save) && (tokenStringIndex <= MAXTOKENLEN))
tokenString[tokenStringIndex++] = (char) c;
if (state == DONE)
{ tokenString[tokenStringIndex] = '\0';
if (currentToken == ID)
//匹配当前的ID是否为保留字
currentToken = reservedLookup(tokenString);
}
} //end while
if (TraceScan) {
fprintf(listing,"\t%d: ",lineno);
printToken(currentToken,tokenString);
}
return currentToken;
} /* end getToken */
配套的一个scan.h
/****************************************************/
/* File: scan.h */
/* The scanner interface for the TINY compiler */
/****************************************************/
#ifndef _SCAN_H_
#define _SCAN_H_
/* MAXTOKENLEN is the maximum size of a token.
 * NOTE(review): the lexical rules quoted with this code cap identifiers
 * at 52 letters, and getToken compares its index against 51, while this
 * limit is 40 - confirm which bound is intended. */
#define MAXTOKENLEN 40
/* tokenString array stores the lexeme of each token */
extern char tokenString[MAXTOKENLEN+1];
/* function getToken returns the
* next token in source file
*/
TokenType getToken(void);
#endif
globals.h 程序的一些全局声明
/****************************************************/
/* File: globals.h */
/* Global types and vars for TINY compiler */
/* must come before other include files */
/****************************************************/
#ifndef _GLOBALS_H_
#define _GLOBALS_H_
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h> //用到其中的isdigit函数和isalpha函数
#include <string.h>
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif
/* MAXRESERVED = the number of reserved words */
#define MAXRESERVED 8
/* Every token category the scanner can report. */
typedef enum
{
  /* book-keeping tokens */
  ENDFILE,
  ERROR,
  /* reserved words */
  IF,
  THEN,
  ELSE,
  END,
  REPEAT,
  UNTIL,
  READ,
  WRITE,
  /* multicharacter tokens */
  ID,
  NUM,
  /* special symbols */
  ASSIGN,
  EQ,
  LT,
  PLUS,
  MINUS,
  TIMES,
  OVER,
  LPAREN,
  RPAREN,
  SEMI
} TokenType;
extern FILE* source; /* source code text file */
extern FILE* listing; /* listing output text file */
extern FILE* code; /* code text file for TM simulator */
extern int lineno; /* source line number for listing */
/**************************************************/
/*********** Syntax tree for parsing ************/
/**************************************************/
/* Top-level classification of a syntax tree node. */
typedef enum
{
  StmtK,
  ExpK
} NodeKind;

/* Kinds of statement node. */
typedef enum
{
  IfK,
  RepeatK,
  AssignK,
  ReadK,
  WriteK
} StmtKind;

/* Kinds of expression node. */
typedef enum
{
  OpK,
  ConstK,
  IdK
} ExpKind;

/* ExpType is used for type checking */
typedef enum
{
  Void,
  Integer,
  Boolean
} ExpType;
/* number of child links every syntax tree node carries */
#define MAXCHILDREN 3
/* Syntax tree node.  A node is either a statement or an expression
 * (nodekind); kind refines that, and attr carries the payload printTree
 * shows for the node.
 */
typedef struct treeNode
{ struct treeNode * child[MAXCHILDREN]; /* subtrees; unused slots are NULL */
struct treeNode * sibling; /* next node at the same level, NULL at the end */
int lineno; /* value of the global lineno when the node was created */
NodeKind nodekind; /* selects which member of kind is valid */
union { StmtKind stmt; ExpKind exp;} kind; /* stmt for StmtK nodes, exp for ExpK nodes */
union { TokenType op; /* printed for OpK nodes */
int val; /* printed for ConstK nodes */
char * name; } attr; /* name: printed for AssignK, ReadK and IdK nodes */
ExpType type; /* for type checking of exps */
} TreeNode;
/**************************************************/
/*********** Flags for tracing ************/
/**************************************************/
/* EchoSource = TRUE causes the source program to
* be echoed to the listing file with line numbers
* during parsing
*/
extern int EchoSource ;
/* TraceScan = TRUE causes token information to be
* printed to the listing file as each token is
* recognized by the scanner
*/
extern int TraceScan ;
/* TraceParse = TRUE causes the syntax tree to be
* printed to the listing file in linearized form
* (using indents for children)
*/
extern int TraceParse;
/* TraceAnalyze = TRUE causes symbol table inserts
* and lookups to be reported to the listing file
*/
extern int TraceAnalyze;
/* TraceCode = TRUE causes comments to be written
* to the TM code file as code is generated
*/
extern int TraceCode;
/* Error = TRUE prevents further passes if an error occurs */
extern int Error;
#endif
util.c 用于输出字符串
/****************************************************/
/* File: util.c */
/* Utility function implementation */
/* for the TINY compiler */
/****************************************************/
#include "globals.h"
#include "util.h"
/* Procedure printToken prints a token
 * and its lexeme to the listing file
 *
 * token       - category of the token to print
 * tokenString - its lexeme; only shown for reserved words, NUM, ID
 *               and ERROR
 *
 * Tokens with a fixed spelling are printed as that spelling; a value
 * outside TokenType is reported as "Unknown token".
 */
void printToken( TokenType token, const char* tokenString )
{
  /* complete output line for tokens whose text never varies */
  const char *fixedLine = NULL;

  switch (token)
  {
    case ASSIGN:  fixedLine = ":=\n";  break;
    case LT:      fixedLine = "<\n";   break;
    case EQ:      fixedLine = "=\n";   break;
    case LPAREN:  fixedLine = "(\n";   break;
    case RPAREN:  fixedLine = ")\n";   break;
    case SEMI:    fixedLine = ";\n";   break;
    case PLUS:    fixedLine = "+\n";   break;
    case MINUS:   fixedLine = "-\n";   break;
    case TIMES:   fixedLine = "*\n";   break;
    case OVER:    fixedLine = "/\n";   break;
    case ENDFILE: fixedLine = "EOF\n"; break;

    case IF:
    case THEN:
    case ELSE:
    case END:
    case REPEAT:
    case UNTIL:
    case READ:
    case WRITE:
      fprintf(listing,"reserved word: %s\n",tokenString);
      return;

    case NUM:
      fprintf(listing,"NUM, val= %s\n",tokenString);
      return;

    case ID:
      fprintf(listing,"ID, name= %s\n",tokenString);
      return;

    case ERROR:
      fprintf(listing,"ERROR: %s\n",tokenString);
      return;

    default: /* should never happen */
      fprintf(listing,"Unknown token: %d\n",token);
      return;
  }

  fputs(fixedLine, listing);
}
/* Function newStmtNode creates a new statement
 * node for syntax tree construction
 *
 * kind - statement kind stored in the node
 *
 * Returns the new node with all children and the sibling link set to
 * NULL and lineno copied from the global line counter; attr and type
 * are left unset.  When malloc fails, a message is written to listing
 * and NULL is returned.
 */
TreeNode * newStmtNode(StmtKind kind)
{
  TreeNode * node = malloc(sizeof *node);
  int slot;

  if (node == NULL)
  {
    fprintf(listing,"Out of memory error at line %d\n",lineno);
    return NULL;
  }

  node->nodekind = StmtK;
  node->kind.stmt = kind;
  node->lineno = lineno;
  node->sibling = NULL;
  for (slot = 0; slot < MAXCHILDREN; slot++)
    node->child[slot] = NULL;
  return node;
}
/* Function newExpNode creates a new expression
 * node for syntax tree construction
 *
 * kind - expression kind stored in the node
 *
 * Returns the new node with all children and the sibling link set to
 * NULL, type set to Void and lineno copied from the global line
 * counter; attr is left unset.  When malloc fails, a message is
 * written to listing and NULL is returned.
 */
TreeNode * newExpNode(ExpKind kind)
{
  TreeNode * node = malloc(sizeof *node);
  int slot;

  if (node == NULL)
  {
    fprintf(listing,"Out of memory error at line %d\n",lineno);
    return NULL;
  }

  node->nodekind = ExpK;
  node->kind.exp = kind;
  node->type = Void;
  node->lineno = lineno;
  node->sibling = NULL;
  for (slot = 0; slot < MAXCHILDREN; slot++)
    node->child[slot] = NULL;
  return node;
}
/* Function copyString allocates and makes a new
 * copy of an existing string
 *
 * s - string to duplicate; may be NULL
 *
 * Returns a malloc'ed copy of s, or NULL when s is NULL.  When the
 * allocation fails, a message is written to listing and NULL is
 * returned.
 */
char * copyString(char * s)
{
  int size;
  char * copy;

  if (s == NULL)
    return NULL;

  size = strlen(s)+1;   /* room for the characters and the '\0' */
  copy = (char *)malloc(size);
  if (copy == NULL)
  {
    fprintf(listing,"Out of memory error at line %d\n",lineno);
    return NULL;
  }
  strcpy(copy,s);
  return copy;
}
/* Variable indentno is used by printTree to
 * store current number of spaces to indent
 */
static int indentno = 0;

/* macros to increase/decrease indentation */
#define INDENT indentno+=2
#define UNINDENT indentno-=2

/* printSpaces indents by printing spaces: it writes indentno blanks
 * to listing.
 */
static void printSpaces(void)
{
  int remaining = indentno;
  while (remaining-- > 0)
    fprintf(listing," ");
}
/* procedure printTree prints a syntax tree to the
 * listing file using indentation to indicate subtrees
 *
 * tree - first node of a sibling list; NULL prints nothing
 *
 * Each node of the list is printed on its own line at the current
 * indentation, followed by its children, which are printed
 * recursively one level (two spaces) deeper.
 */
void printTree( TreeNode * tree )
{
  TreeNode * node;
  int i;

  INDENT;
  for (node = tree; node != NULL; node = node->sibling)
  {
    printSpaces();
    if (node->nodekind == StmtK)
    {
      switch (node->kind.stmt)
      {
        case IfK:
          fprintf(listing,"If\n");
          break;
        case RepeatK:
          fprintf(listing,"Repeat\n");
          break;
        case AssignK:
          fprintf(listing,"Assign to: %s\n",node->attr.name);
          break;
        case ReadK:
          fprintf(listing,"Read: %s\n",node->attr.name);
          break;
        case WriteK:
          fprintf(listing,"Write\n");
          break;
        default:
          /* this branch handles statement nodes, so name them as
             such (the message used to say "ExpNode") */
          fprintf(listing,"Unknown StmtNode kind\n");
          break;
      }
    }
    else if (node->nodekind == ExpK)
    {
      switch (node->kind.exp)
      {
        case OpK:
          fprintf(listing,"Op: ");
          printToken(node->attr.op,"\0");
          break;
        case ConstK:
          fprintf(listing,"Const: %d\n",node->attr.val);
          break;
        case IdK:
          fprintf(listing,"Id: %s\n",node->attr.name);
          break;
        default:
          fprintf(listing,"Unknown ExpNode kind\n");
          break;
      }
    }
    else
      fprintf(listing,"Unknown node kind\n");

    for (i = 0; i < MAXCHILDREN; i++)
      printTree(node->child[i]);
  }
  UNINDENT;
}
util.h
/****************************************************/
/* File: util.h */
/* Utility functions for the TINY compiler */
/****************************************************/
#ifndef _UTIL_H_
#define _UTIL_H_
/* Procedure printToken prints a token
* and its lexeme to the listing file
*/
void printToken( TokenType, const char* );
/* Function newStmtNode creates a new statement
* node for syntax tree construction
*/
TreeNode * newStmtNode(StmtKind);
/* Function newExpNode creates a new expression
* node for syntax tree construction
*/
TreeNode * newExpNode(ExpKind);
/* Function copyString allocates and makes a new
* copy of an existing string
*/
char * copyString( char * );
/* procedure printTree prints a syntax tree to the
* listing file using indentation to indicate subtrees
*/
void printTree( TreeNode * );
#endif
main.c 主程序
/****************************************************/
/* File: main.c */
/* Main program for TINY compiler */
/****************************************************/
#include "globals.h"
/* set NO_PARSE to TRUE to get a scanner-only compiler */
#define NO_PARSE TRUE
/* set NO_ANALYZE to TRUE to get a parser-only compiler */
#define NO_ANALYZE FALSE
/* set NO_CODE to TRUE to get a compiler that does not
* generate code
*/
#define NO_CODE FALSE
#include "util.h"
#if NO_PARSE
#include "scan.h"
#else
//#include "parse.h"
#if NO_ANALYZE
//#include "analyze.h"
#if NO_CODE
//#include "cgen.h"
#endif
#endif
#endif
/* allocate global variables */
int lineno = 0;
FILE * source;
FILE * listing;
FILE * code;
/* allocate and set tracing flags */
int EchoSource = TRUE;
int TraceScan = TRUE;
// important: TraceScan must be TRUE for getToken to print each token it recognizes
int TraceParse = FALSE;
int TraceAnalyze = FALSE;
int TraceCode = FALSE;
int Error = FALSE;
/* main: scanner-only driver for the TINY compiler.
 *
 * Expects exactly one argument, the source file name; ".tny" is
 * appended when the name contains no '.'.  The file is opened, a
 * banner is written to the listing (stdout) and getToken is called
 * until it returns ENDFILE.
 *
 * Exits with status 1 on a usage error, when the file name does not
 * fit into the local buffer, or when the file cannot be opened;
 * returns 0 otherwise.
 */
int main( int argc, char * argv[] )
{ TreeNode * syntaxTree; /* only needed once the parser phase below is enabled */
  char pgm[120]; /* source code file name */
  size_t nameLen;
  if (argc != 2)
    { fprintf(stderr,"usage: %s <filename>\n",argv[0]);
      exit(1);
    }
  /* refuse names that would overflow pgm instead of copying blindly */
  nameLen = strlen(argv[1]);
  if (nameLen >= sizeof pgm)
    { fprintf(stderr,"File name too long: %s\n",argv[1]);
      exit(1);
    }
  strcpy(pgm,argv[1]) ;
  if (strchr (pgm, '.') == NULL)
    { /* the default extension has to fit as well */
      if (nameLen + strlen(".tny") >= sizeof pgm)
        { fprintf(stderr,"File name too long: %s\n",argv[1]);
          exit(1);
        }
      strcat(pgm,".tny");
    }
  source = fopen(pgm,"r");
  /* the file could not be opened */
  if (source==NULL)
    { fprintf(stderr,"File %s not found\n",pgm);
      exit(1);
    }
  /* the file is open */
  listing = stdout; /* send listing to screen */
  fprintf(listing,"\nTINY COMPILATION: %s\n",pgm);
//#if NO_PARSE
  while (getToken()!=ENDFILE); /* lexical analysis */
//#else printf("1\n");
//syntaxTree = parse(); /* syntax analysis */
//if (TraceParse) {
// fprintf(listing,"\nSyntax tree:\n");
// printTree(syntaxTree);
//}
//#if NO_ANALYZE /* semantic analysis, disabled for now */
//if (! Error)
//{ if (TraceAnalyze) fprintf(listing,"\nBuilding Symbol Table...\n");
// buildSymtab(syntaxTree);
// if (TraceAnalyze) fprintf(listing,"\nChecking Types...\n");
// typeCheck(syntaxTree);
// if (TraceAnalyze) fprintf(listing,"\nType Checking Finished\n");
//}
//#if NO_CODE
//if (! Error)
//{ char * codefile;
// int fnlen = strcspn(pgm,".");
// codefile = (char *) calloc(fnlen+4, sizeof(char));
// strncpy(codefile,pgm,fnlen);
// strcat(codefile,".tm");
// code = fopen(codefile,"w");
// if (code == NULL)
// { printf("Unable to open %s\n",codefile);
// exit(1);
// }
// codeGen(syntaxTree,codefile); /* code generation, disabled for now */
// fclose(code);
//}
//#endif
//#endif
//#endif
  fclose(source);
  return 0;
}
实例:SAMPLE.tny
{ Sample program
in TINY language -
computes factorial
}
read x; { input an integer }
if 0 < x then { don't compute if x <= 0 }
fact := 1;
repeat
fact := fact * x;
x := x - 1
until x = 0;
write fact { output factorial of x }
end
将main.c scan.c scan.h globals.h util.c util.h编译后运行,对主函数传参:SAMPLE.tny的文件绝对路径
运行结果: