1. 实验内容
● TINY语言的词法由TINY Syntax.ppt描述;
● TINY语言的词法分析器由TINY Scanner.rar的C语言代码实现;
● TINY+语言的词法由TINY+ Syntax.doc描述。
任务:理解TINY语言的词法及词法分析器的实现,并基于该词法分析器,实现拓展语言TINY+的词法分析器。
要求:
(1) TINY+词法分析器以TINY+源代码为输入,输出为识别出的token序列;
(2) 词法分析器以最长匹配为原则,例如‘:=’应识别为赋值符号而非单独的‘:’及‘=’;
(3) Token以(种别码,属性值)表示,包含以下类型的种别码:
- KEY为关键字;
- SYM为系统特殊字符;
- ID为变量;
- NUM为数值常量;
- STR为字符串常量。
(4) 识别词法错误。词法分析器可以给出词法错误的行号并打印出对应的出错消息,主要包含以下类型的词法错误:
a) 非法字符。即不属于TINY+字母表的字符,比如$就是一个非法字符
b)字符串匹配错误,比如右部引号丢失,如‘scanner
c)注释的右部括号丢失或匹配错误,如{this is an example
#define _CRT_SECURE_NO_WARNINGS
/****************************************************/
/* File: globals.h */
/* Global types and vars for TINY compiler */
/* must come before other include files */
/* Compiler Construction: Principles and Practice */
/* Kenneth C. Louden */
/****************************************************/
#ifndef _GLOBALS_H_
#define _GLOBALS_H_
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif
/* MAXRESERVED = the number of reserved words */
#define MAXRESERVED 18
/* TokenType lists every token kind the scanner can hand back.
 * The enumerators keep their declaration order, so each one keeps the
 * same numeric value (ENDFILE is 0, ERROR is 1, and so on).
 */
typedef enum {
    /* book-keeping tokens */
    ENDFILE,
    ERROR,

    /* reserved words */
    IF, THEN, ELSE, END,
    REPEAT, UNTIL,
    READ, WRITE,
    TRUE1, FALSE1,
    OR, AND, NOT,
    INT, BOOL1, STRING, FLOAT, DOUBLE,
    DO, WHILE,

    /* multicharacter tokens */
    ID,
    NUM,
    STR,

    /* special symbols */
    ASSIGN, EQ, LT,
    PLUS, MINUS, TIMES, OVER,
    LPAREN, RPAREN, SEMI,
    MT, ME, LE,
    COMMA, UPDOX, PERCENT
} TokenType;
extern FILE* source; /* source code text file */
extern FILE* listing; /* listing output text file */
extern FILE* code; /* code text file for TM simulator */
extern int lineno; /* source line number for listing */
/**************************************************/
/*********** Syntax tree for parsing ************/
/**************************************************/
/* A syntax-tree node is either a statement or an expression. */
typedef enum {
    StmtK,
    ExpK
} NodeKind;

/* Kinds of statement node. */
typedef enum {
    IfK,
    RepeatK,
    AssignK,
    ReadK,
    WriteK
} StmtKind;

/* Kinds of expression node. */
typedef enum {
    OpK,
    ConstK,
    IdK
} ExpKind;

/* ExpType is used for type checking */
typedef enum {
    Void,
    Integer,
    Boolean
} ExpType;
/* MAXCHILDREN = number of child slots in every syntax-tree node */
#define MAXCHILDREN 3
/* TreeNode is one node of the syntax tree. Nodes are created by
 * newStmtNode / newExpNode and printed by printTree, which visits the
 * children of a node and then follows its sibling link.
 */
typedef struct treeNode
{
struct treeNode* child[MAXCHILDREN]; /* child subtrees; set to NULL on creation */
struct treeNode* sibling; /* next node on the same level; NULL on creation */
int lineno; /* value of the global lineno when the node was created */
NodeKind nodekind; /* StmtK or ExpK: tells which member of kind is in use */
union { StmtKind stmt; ExpKind exp; } kind;
union {
TokenType op; /* read by printTree for OpK nodes */
int val; /* read by printTree for ConstK nodes */
char* name; /* read by printTree for IdK, AssignK and ReadK nodes */
} attr;
ExpType type; /* for type checking of exps */
} TreeNode;
/**************************************************/
/*********** Flags for tracing ************/
/**************************************************/
/* EchoSource = TRUE causes the source program to
* be echoed to the listing file with line numbers
* during parsing
*/
extern int EchoSource;
/* TraceScan = TRUE causes token information to be
* printed to the listing file as each token is
* recognized by the scanner
*/
extern int TraceScan;
/* TraceParse = TRUE causes the syntax tree to be
* printed to the listing file in linearized form
* (using indents for children)
*/
extern int TraceParse;
/* TraceAnalyze = TRUE causes symbol table inserts
* and lookups to be reported to the listing file
*/
extern int TraceAnalyze;
/* TraceCode = TRUE causes comments to be written
* to the TM code file as code is generated
*/
extern int TraceCode;
/* Error = TRUE prevents further passes if an error occurs */
extern int Error;
#endif
#ifndef _SCAN_H_
#define _SCAN_H_
/* MAXTOKENLEN is the maximum size of a token */
#define MAXTOKENLEN 40
/* tokenString array stores the lexeme of each token */
extern char tokenString[MAXTOKENLEN + 1];
/* function getToken returns the
* next token in source file
*/
TokenType getToken(void);
#endif
#ifndef _UTIL_H_
#define _UTIL_H_
/* Procedure printToken prints a token
* and its lexeme to the listing file
*/
void printToken(TokenType, const char*);
/* Function newStmtNode creates a new statement
* node for syntax tree construction
*/
TreeNode* newStmtNode(StmtKind);
/* Function newExpNode creates a new expression
* node for syntax tree construction
*/
TreeNode* newExpNode(ExpKind);
/* Function copyString allocates and makes a new
* copy of an existing string
*/
char* copyString(char*);
/* procedure printTree prints a syntax tree to the
* listing file using indentation to indicate subtrees
*/
void printTree(TreeNode*);
#endif
/* Procedure printToken prints a token and its lexeme to the listing file.
 *
 *   token       - kind of the token to print
 *   tokenString - lexeme of the token; only used for keywords, numbers,
 *                 identifiers, strings and errors (symbols are printed
 *                 from fixed text)
 *
 * Output categories: KEY (reserved word), SYM (special symbol), ID, NUM,
 * STR, ERROR and EOF.
 */
void printToken(TokenType token, const char* tokenString)
{
    switch (token)
    {
    case IF:
    case THEN:
    case ELSE:
    case END:
    case REPEAT:
    case UNTIL:
    case READ:
    case WRITE:
    case TRUE1:
    case FALSE1:
    case OR:
    case AND:
    case NOT:
    case INT:
    case BOOL1:
    case FLOAT:
    case STRING:
    case DOUBLE:
    case DO:
    case WHILE:
        fprintf(listing, "KEY: %s\n", tokenString);
        break;
    case ASSIGN:  fprintf(listing, "SYM: :=\n"); break;
    case LT:      fprintf(listing, "SYM: <\n");  break;
    case MT:      fprintf(listing, "SYM: >\n");  break;
    case LE:      fprintf(listing, "SYM: <=\n"); break;
    case ME:      fprintf(listing, "SYM: >=\n"); break;
    case EQ:      fprintf(listing, "SYM: =\n");  break;
    case COMMA:   fprintf(listing, "SYM: ,\n");  break;
    case UPDOX:   fprintf(listing, "SYM: \'\n"); break;
    /* A literal percent sign must be written as %% in a format string;
     * a lone % followed by a newline is an invalid conversion. */
    case PERCENT: fprintf(listing, "SYM: %%\n"); break;
    case LPAREN:  fprintf(listing, "SYM: (\n");  break;
    case RPAREN:  fprintf(listing, "SYM: )\n");  break;
    case SEMI:    fprintf(listing, "SYM: ;\n");  break;
    case PLUS:    fprintf(listing, "SYM: +\n");  break;
    case MINUS:   fprintf(listing, "SYM: -\n");  break;
    case TIMES:   fprintf(listing, "SYM: *\n");  break;
    case OVER:    fprintf(listing, "SYM: /\n");  break;
    case ENDFILE: fprintf(listing, "EOF\n");     break;
    case NUM:
        fprintf(listing, "NUM, val= %s\n", tokenString);
        break;
    case ID:
        fprintf(listing, "ID, name= %s\n", tokenString);
        break;
    case ERROR:
        fprintf(listing, "ERROR: %s\n", tokenString);
        break;
    case STR:
        fprintf(listing, "STR, val= %s\n", tokenString);
        break;
    default: /* should never happen */
        fprintf(listing, "Unknown token: %d\n", token);
    }
}
/* newStmtNode allocates a statement node of the given kind for the
 * syntax tree. All child links and the sibling link start out NULL and
 * the node records the current source line. On allocation failure a
 * message is written to the listing file and NULL is returned.
 */
TreeNode* newStmtNode(StmtKind kind)
{
    int slot;
    TreeNode* node = (TreeNode*)malloc(sizeof(TreeNode));

    if (node == NULL) {
        fprintf(listing, "Out of memory error at line %d\n", lineno);
        return node;
    }

    slot = 0;
    while (slot < MAXCHILDREN) {
        node->child[slot] = NULL;
        slot++;
    }
    node->sibling = NULL;
    node->lineno = lineno;
    node->nodekind = StmtK;
    node->kind.stmt = kind;
    return node;
}
/* newExpNode allocates an expression node of the given kind for the
 * syntax tree. All child links and the sibling link start out NULL, the
 * node records the current source line and its type starts as Void. On
 * allocation failure a message is written to the listing file and NULL
 * is returned.
 */
TreeNode* newExpNode(ExpKind kind)
{
    int slot;
    TreeNode* node = (TreeNode*)malloc(sizeof(TreeNode));

    if (node == NULL) {
        fprintf(listing, "Out of memory error at line %d\n", lineno);
        return node;
    }

    slot = 0;
    while (slot < MAXCHILDREN) {
        node->child[slot] = NULL;
        slot++;
    }
    node->sibling = NULL;
    node->lineno = lineno;
    node->nodekind = ExpK;
    node->kind.exp = kind;
    node->type = Void;
    return node;
}
/* Function copyString allocates and makes a new copy of an existing string.
 *
 *   s - string to duplicate; may be NULL
 *
 * Returns NULL when s is NULL. When allocation fails, a message is written
 * to the listing file and NULL is returned. Otherwise returns a malloc'd
 * copy that the caller owns and must free.
 */
char* copyString(char* s)
{
    size_t len; /* size_t, not int: strlen's result must not be narrowed */
    char* t;

    if (s == NULL) return NULL;
    len = strlen(s) + 1; /* include the terminating NUL */
    t = (char*)malloc(len);
    if (t == NULL)
        fprintf(listing, "Out of memory error at line %d\n", lineno);
    else
        memcpy(t, s, len);
    return t;
}
/* indentno holds the number of spaces printTree currently indents by */
static int indentno = 0;
/* INDENT / UNINDENT move the indentation in or out by two spaces */
#define INDENT indentno+=2
#define UNINDENT indentno-=2
/* printSpaces writes indentno spaces to the listing file */
static void printSpaces(void)
{
    int remaining = indentno;

    while (remaining > 0) {
        fprintf(listing, " ");
        remaining--;
    }
}
/* Procedure printTree prints a syntax tree to the listing file, using
 * indentation to indicate subtrees.
 *
 *   tree - first node of a sibling chain; NULL prints nothing
 *
 * Each node is printed on its own line, followed (one level deeper) by
 * its children, before moving on to the next sibling.
 */
void printTree(TreeNode* tree)
{
    int i;

    INDENT;
    while (tree != NULL) {
        printSpaces();
        if (tree->nodekind == StmtK) {
            switch (tree->kind.stmt) {
            case IfK:
                fprintf(listing, "If\n");
                break;
            case RepeatK:
                fprintf(listing, "Repeat\n");
                break;
            case AssignK:
                fprintf(listing, "Assign to: %s\n", tree->attr.name);
                break;
            case ReadK:
                fprintf(listing, "Read: %s\n", tree->attr.name);
                break;
            case WriteK:
                fprintf(listing, "Write\n");
                break;
            default:
                /* this branch handles statement nodes, so name them as such */
                fprintf(listing, "Unknown StmtNode kind\n");
                break;
            }
        }
        else if (tree->nodekind == ExpK) {
            switch (tree->kind.exp) {
            case OpK:
                fprintf(listing, "Op: ");
                /* operators are printed from fixed text, so no lexeme is needed */
                printToken(tree->attr.op, "\0");
                break;
            case ConstK:
                fprintf(listing, "Const: %d\n", tree->attr.val);
                break;
            case IdK:
                fprintf(listing, "Id: %s\n", tree->attr.name);
                break;
            default:
                fprintf(listing, "Unknown ExpNode kind\n");
                break;
            }
        }
        else {
            fprintf(listing, "Unknown node kind\n");
        }
        for (i = 0; i < MAXCHILDREN; i++)
            printTree(tree->child[i]);
        tree = tree->sibling;
    }
    UNINDENT;
}
/* states in scanner DFA */
typedef enum {
    START,      /* between tokens */
    INASSIGN,   /* after ':' */
    INCOMMENT,  /* after '{' */
    INNUM,      /* inside a number */
    INID,       /* inside an identifier or reserved word */
    DONE,       /* a token has been recognized */
    INLE,       /* after '<' */
    INME,       /* after '>' */
    INUPDOX     /* after an opening single quote */
} StateType;
/* lexeme of the token most recently returned by getToken (identifier,
 * reserved word, number, string text or error message); getToken
 * NUL-terminates it */
char tokenString[MAXTOKENLEN + 1];
/* BUFLEN = length of the input buffer for
source code lines */
#define BUFLEN 256
static char lineBuf[BUFLEN]; /* holds the current line */
static int linepos = 0; /* index of the next unread character in lineBuf */
static int bufsize = 0; /* number of characters currently stored in lineBuf */
static int EOF_flag = FALSE; /* set when fgets fails; makes ungetNextChar a no-op */
/* getNextChar returns the next character of the source text (blanks and
 * newlines included), reading a new line into lineBuf when the current
 * one is exhausted.
 *
 * Returns the character as an unsigned char converted to int, or EOF when
 * no more input can be read. Returning the byte unsigned keeps values
 * >= 0x80 (for example multi-byte text inside the source) non-negative,
 * so they are safe to pass to <ctype.h> functions and can never be
 * mistaken for EOF.
 *
 * Side effects: increments lineno on every refill attempt, echoes the new
 * line to the listing file when EchoSource is set, and sets EOF_flag once
 * the input is exhausted.
 */
static int getNextChar(void)
{
    if (linepos < bufsize)
        return (unsigned char)lineBuf[linepos++];

    /* current line used up: fetch the next one */
    lineno++;
    if (fgets(lineBuf, BUFLEN - 1, source) == NULL)
    {
        EOF_flag = TRUE;
        return EOF;
    }
    if (EchoSource)
        fprintf(listing, "%d: %s", lineno, lineBuf);
    bufsize = (int)strlen(lineBuf);
    linepos = 0;
    return (unsigned char)lineBuf[linepos++];
}
/* ungetNextChar steps back one character in lineBuf so that the next
 * getNextChar call returns it again. Once EOF has been reached there is
 * nothing to push back, so the call does nothing.
 */
static void ungetNextChar(void)
{
    if (EOF_flag)
        return;
    linepos--;
}
/* lookup table of reserved words: spelling and the token it maps to */
static struct
{
    const char* str;
    TokenType tok;
} reservedWords[MAXRESERVED] = {
    { "if",     IF     },
    { "then",   THEN   },
    { "else",   ELSE   },
    { "end",    END    },
    { "repeat", REPEAT },
    { "until",  UNTIL  },
    { "read",   READ   },
    { "write",  WRITE  },
    { "false",  FALSE1 },
    { "true",   TRUE1  },
    { "or",     OR     },
    { "and",    AND    },
    { "not",    NOT    },
    { "int",    INT    },
    { "bool",   BOOL1  },
    { "string", STRING },
    { "while",  WHILE  },
    { "do",     DO     }
};
/* reservedLookup checks whether the lexeme s is a reserved word.
 * It walks the reservedWords table front to back (linear search) and
 * returns the matching keyword token, or ID when s is not in the table.
 */
static TokenType reservedLookup(char* s)
{
    int idx = 0;

    while (idx < MAXRESERVED) {
        if (strcmp(s, reservedWords[idx].str) == 0)
            return reservedWords[idx].tok;
        ++idx;
    }
    return ID;
}
/****************************************/
/* the primary function of the scanner */
/****************************************/
/* function getToken returns the
* next token in source file
*/
TokenType getToken(void)//扫描读入的文法
{ /* index for storing into tokenString */
//索引
int tokenStringIndex = 0;
/* holds current token to be returned *///当前词法单元
TokenType currentToken;
/* current state - always begins at START *///当前状态—总是从头开始
StateType state = START;
/* flag to indicate save to tokenString *///标志,当前字符是否有效
int save;
while (state != DONE)//匹配到一个词素之后就结束
{
int c = getNextChar();
save = TRUE;
switch (state)//状态转移
{
case START:
if (isdigit(c))//数字
state = INNUM;
else if (isalpha(c))//id
state = INID;
else if (c == ':')//冒号
state = INASSIGN;
else if(c=='>')
{
state=INME;
}
else if(c=='<')
{
state=INLE;
}
else if ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'))
save = FALSE;
else if (c == '{')//注释
{
save = FALSE;
state = INCOMMENT;
}
else if(c=='\'')
{
save = FALSE;
state = INUPDOX;
}
else
{
state = DONE;
switch (c)
{
case EOF:
save = FALSE;
currentToken = ENDFILE;
break;
case '=':
currentToken = EQ;
break;
case '<':
currentToken = LT;
break;
case '+':
currentToken = PLUS;
break;
case '-':
currentToken = MINUS;
break;
case '*':
currentToken = TIMES;
break;
case '/':
currentToken = OVER;
break;
case '(':
currentToken = LPAREN;
break;
case ')':
currentToken = RPAREN;
break;
case ';':
currentToken = SEMI;
break;
case ',':
currentToken = COMMA;
break;
case '%':
currentToken = PERCENT;
break;
default:
currentToken = ERROR;
break;
}
}
break;
case INCOMMENT:
save = FALSE;
if (c == EOF)
{
state = DONE;
currentToken = ERROR;
strcpy(tokenString,"Missing \" } \" !");
tokenStringIndex+=15;
}
else if (c == '}')//直到遇到}为止
state = START;
break;
case INUPDOX://冒号之后,TINY的语法,赋值语句
if (c == '\'')
{
save = FALSE;
state = DONE;
currentToken = STR;
}
else if (!(linepos < bufsize))
{
save = FALSE;
state = DONE;
currentToken = ERROR;
strcpy(tokenString,"Missing \" \' \" !");
tokenStringIndex+=15;
}
break;
case INASSIGN:
state = DONE;
if (c == '=')//:=代表赋值
currentToken = ASSIGN;
else
{ /* backup in the input */
ungetNextChar();
save = FALSE;
currentToken = ERROR;
}
break;
case INNUM:
if (!isdigit(c))
{ /* backup in the input */
ungetNextChar();//回退,相当于老师上课讲的jump
save = FALSE;//不保存现在的字符
state = DONE;//完成一个词素的匹配
currentToken = NUM;//类型为NUM
}
break;
case INLE:
if (c=='=')
{
state = DONE;
currentToken = LE;
}
else
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = LT;
}
break;
case INME:
if (c=='=')
{
state = DONE;
currentToken = ME;
}
else
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = MT;
}
break;
case INID:
if (!(isalpha(c)||isdigit(c)))
{ /* backup in the input */
ungetNextChar();
save = FALSE;
state = DONE;
currentToken = ID;
}
break;
case DONE:
default: /* should never happen */
fprintf(listing, "Scanner Bug: state= %d\n", state);
state = DONE;
currentToken = ERROR;
break;
}
if ((save) && (tokenStringIndex <= MAXTOKENLEN))
tokenString[tokenStringIndex++] = (char)c;
if (state == DONE)
{
tokenString[tokenStringIndex] = '\0';
if (currentToken == ID)
currentToken = reservedLookup(tokenString);//检测是否为保留字
}
}
if (TraceScan) {
fprintf(listing, "\t%d: ", lineno);
printToken(currentToken, tokenString);
}
return currentToken;
} /* end getToken */
/* set NO_PARSE to TRUE to get a scanner-only compiler */
#define NO_PARSE TRUE
/* set NO_ANALYZE to TRUE to get a parser-only compiler */
#define NO_ANALYZE TRUE
/* set NO_CODE to TRUE to get a compiler that does not
 * generate code
 */
#define NO_CODE FALSE
/* allocate global variables */
int lineno = 0; /* current source line number; advanced by getNextChar */
FILE* source; /* source text being scanned; opened in main */
FILE* listing; /* destination of all listing output; main sets it to stdout */
FILE* code; /* code output file; never assigned in the code visible here */
/* allocate and set tracing flags */
int EchoSource = TRUE; /* getNextChar echoes every line it reads, with its number */
int TraceScan = TRUE; /* getToken prints every token it returns */
int TraceParse = TRUE; /* not read by any code visible here */
int TraceAnalyze = FALSE; /* not read by any code visible here */
int TraceCode = FALSE; /* not read by any code visible here */
int Error = FALSE; /* not read by any code visible here */
/* Scanner-only driver: opens the TINY+ source file, then calls getToken
 * until ENDFILE so that every token is listed on stdout.
 *
 *   argv[1] - optional path of the source file; when it is absent the
 *             default "tiny+2.txt" is used
 *
 * Returns 0 on success; exits with status 1 when the file cannot be
 * opened.
 */
int main(int argc, char* argv[])
{
    const char* pgm = (argc > 1) ? argv[1] : "tiny+2.txt";

    source = fopen(pgm, "r");
    if (source == NULL)
    {
        fprintf(stderr, "File %s not found\n", pgm);
        exit(1);
    }
    listing = stdout; /* send listing to screen */
    fprintf(listing, "\nTINY COMPILATION: %s\n\n", pgm);

    /* getToken prints each token itself when TraceScan is set */
    while (getToken() != ENDFILE)
        ;

    fclose(source);
#ifdef _WIN32
    /* keep the console window open; "pause" only exists on Windows */
    system("pause");
#endif
    return 0;
}