编译原理实验二词法分析程序设计

最新推荐文章于 2023-07-12 17:01:18 发布
Ruik@SZTU
最新推荐文章于 2023-07-12 17:01:18 发布
阅读量3k
点赞数 23
分类专栏：编译原理　文章标签：c语言、开发语言、c++、数据结构
本文链接：https://blog.csdn.net/weixin_54141552/article/details/124799136
版权
编译原理专栏收录该内容
2 篇文章
订阅专栏
博客内容涉及TINY语言的词法分析器的C语言实现，以及如何基于此实现拓展语言TINY+的词法分析器。要求包括最长匹配原则，识别不同类型的Token，错误处理等。博客还包含了词法错误的示例以及相关代码片段。
摘要由CSDN通过智能技术生成
1. 实验内容
● TINY语言的词法由TINY Syntax.ppt描述；
● TINY语言的词法分析器由TINY Scanner.rar的C语言代码实现；
● TINY+语言的词法由TINY+ Syntax.doc描述。
任务：理解TINY语言的词法及词法分析器的实现，并基于该词法分析器，实现拓展语言TINY+的词法分析器。
要求：
（1） TINY+词法分析器以TINY+源代码为输入，输出为识别出的token序列；
（2）词法分析器以最长匹配为原则，例如 ':=' 应识别为赋值符号，而非单独的 ':' 及 '='；
（3） Token以（种别码，属性值）表示，包含以下类型的种别码：
1. KEY为关键字；
2. SYM为系统特殊字符；
3. ID为变量；
4. NUM为数值常量；
5. STR为字符串常量。
（4）识别词法错误。词法分析器可以给出词法错误的行号并打印出对应的出错消息，主要包含以下类型的词法错误：
a) 非法字符，即不属于TINY+字母表的字符，比如 $ 就是一个非法字符；
b) 字符串匹配错误，比如右部引号丢失，如 'scanner；
c) 注释的右部括号丢失或匹配错误，如 {this is an example。
#define _CRT_SECURE_NO_WARNINGS
/****************************************************/
/* File: globals.h                                  */
/* Global types and vars for TINY compiler          */
/* must come before other include files             */
/* Compiler Construction: Principles and Practice   */
/* Kenneth C. Louden                                */
/****************************************************/

#ifndef _GLOBALS_H_
#define _GLOBALS_H_

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/* Boolean constants, defined only when nothing earlier provided them */
#if !defined(FALSE)
#define FALSE 0
#endif

#if !defined(TRUE)
#define TRUE 1
#endif

/* MAXRESERVED = the number of reserved words */
#define MAXRESERVED 18

/* Kinds of token produced by the scanner. Enumerator order fixes the
 * numeric codes, so new entries must not be inserted in the middle.
 */
typedef enum
{
    /* book-keeping tokens */
    ENDFILE,
    ERROR,
    /* reserved words */
    IF, THEN, ELSE, END,
    REPEAT, UNTIL,
    READ, WRITE,
    TRUE1, FALSE1,
    OR, AND, NOT,
    INT, BOOL1, STRING, FLOAT, DOUBLE,
    DO, WHILE,
    /* multicharacter tokens */
    ID,
    NUM,
    STR,
    /* special symbols */
    ASSIGN, EQ, LT,
    PLUS, MINUS, TIMES, OVER,
    LPAREN, RPAREN, SEMI,
    MT, ME, LE,
    COMMA, UPDOX, PERCENT
} TokenType;

/* source code text file */
extern FILE *source;
/* listing output text file */
extern FILE *listing;
/* code text file for TM simulator */
extern FILE *code;

/* source line number for listing */
extern int lineno;

/**************************************************/
/***********   Syntax tree for parsing ************/
/**************************************************/

/* a node is either a statement or an expression */
typedef enum
{
    StmtK,
    ExpK
} NodeKind;

/* the statement forms a StmtK node can take */
typedef enum
{
    IfK,
    RepeatK,
    AssignK,
    ReadK,
    WriteK
} StmtKind;

/* the expression forms an ExpK node can take */
typedef enum
{
    OpK,
    ConstK,
    IdK
} ExpKind;

/* ExpType is used for type checking */
typedef enum
{
    Void,
    Integer,
    Boolean
} ExpType;

/* number of child slots in every tree node */
#define MAXCHILDREN 3

typedef struct treeNode
{
    struct treeNode *child[MAXCHILDREN];
    struct treeNode *sibling;
    int lineno;
    NodeKind nodekind;
    /* which member is meaningful follows nodekind */
    union
    {
        StmtKind stmt;
        ExpKind exp;
    } kind;
    /* node payload: operator token, constant value, or name */
    union
    {
        TokenType op;
        int val;
        char *name;
    } attr;
    ExpType type; /* for type checking of exps */
} TreeNode;

/**************************************************/
/***********   Flags for tracing       ************/
/**************************************************/

/* EchoSource = TRUE: echo the source program, with line
 * numbers, to the listing file while it is being read.
 */
extern int EchoSource;

/* TraceScan = TRUE: print each token to the listing file
 * as the scanner recognizes it.
 */
extern int TraceScan;

/* TraceParse = TRUE: print the syntax tree to the listing
 * file in linearized form (children are indented).
 */
extern int TraceParse;

/* TraceAnalyze = TRUE: report symbol table inserts and
 * lookups to the listing file.
 */
extern int TraceAnalyze;

/* TraceCode = TRUE: write comments to the TM code file
 * as code is generated.
 */
extern int TraceCode;

/* Error = TRUE prevents further passes if an error occurs */
extern int Error;
#endif

#if !defined(_SCAN_H_)
#define _SCAN_H_

/* MAXTOKENLEN is the maximum size of a token */
#define MAXTOKENLEN 40

/* tokenString holds the lexeme of the current token;
 * it has room for MAXTOKENLEN characters plus the terminator
 */
extern char tokenString[MAXTOKENLEN + 1];

/* getToken returns the next token in the source file */
TokenType getToken(void);

#endif

#ifndef _UTIL_H_
#define _UTIL_H_

/* Procedure printToken prints a token
 * and its lexeme to the listing file
 */
void printToken(TokenType, const char*);

/* Function newStmtNode creates a new statement
 * node for syntax tree construction
 */
TreeNode* newStmtNode(StmtKind);

/* Function newExpNode creates a new expression
 * node for syntax tree construction
 */
TreeNode* newExpNode(ExpKind);

/* Function copyString allocates and makes a new
 * copy of an existing string
 */
char* copyString(char*);

/* procedure printTree prints a syntax tree to the
 * listing file using indentation to indicate subtrees
 */
void printTree(TreeNode*);

#endif

/* Procedure printToken writes one token to the listing file.
 *
 * Reserved words are printed as "KEY: <lexeme>", special symbols as
 * "SYM: <symbol>", and ID / NUM / STR / ERROR tokens together with the
 * lexeme passed in tokenString. ENDFILE prints "EOF". Any other token
 * value is reported as "Unknown token" followed by its numeric code.
 *
 * token       - kind of token to print
 * tokenString - lexeme text; only used for the kinds that print it
 */
void printToken(TokenType token, const char* tokenString)
{
    switch (token)
    {
        /* every reserved word is printed the same way */
        case IF:
        case THEN:
        case ELSE:
        case END:
        case REPEAT:
        case UNTIL:
        case READ:
        case WRITE:
        case TRUE1:
        case FALSE1:
        case OR:
        case AND:
        case NOT:
        case INT:
        case BOOL1:
        case FLOAT:
        case STRING:
        case DOUBLE:
        case DO:
        case WHILE:
            fprintf(listing, "KEY: %s\n", tokenString);
            break;
        case ASSIGN: fprintf(listing, "SYM: :=\n"); break;
        case LT: fprintf(listing, "SYM: <\n"); break;
        case MT: fprintf(listing, "SYM: >\n"); break;
        case LE: fprintf(listing, "SYM: <=\n"); break;
        case ME: fprintf(listing, "SYM: >=\n"); break;
        case EQ: fprintf(listing, "SYM: =\n"); break;
        case COMMA: fprintf(listing, "SYM: ,\n"); break;
        case UPDOX: fprintf(listing, "SYM: \'\n"); break;
        /* a literal percent sign must be written as "%%" in a format
         * string; a lone '%' followed by '\n' is an invalid conversion */
        case PERCENT: fprintf(listing, "SYM: %%\n"); break;
        case LPAREN: fprintf(listing, "SYM: (\n"); break;
        case RPAREN: fprintf(listing, "SYM: )\n"); break;
        case SEMI: fprintf(listing, "SYM: ;\n"); break;
        case PLUS: fprintf(listing, "SYM: +\n"); break;
        case MINUS: fprintf(listing, "SYM: -\n"); break;
        case TIMES: fprintf(listing, "SYM: *\n"); break;
        case OVER: fprintf(listing, "SYM: /\n"); break;
        case ENDFILE: fprintf(listing, "EOF\n"); break;
        case NUM:
            fprintf(listing, "NUM, val= %s\n", tokenString);
            break;
        case ID:
            fprintf(listing, "ID, name= %s\n", tokenString);
            break;
        case ERROR:
            fprintf(listing, "ERROR: %s\n", tokenString);
            break;
        case STR:
            fprintf(listing, "STR, val= %s\n", tokenString);
            break;
        default: /* should never happen */
            fprintf(listing, "Unknown token: %d\n", token);
            break;
    }
}

/* Function newStmtNode allocates a statement node for the syntax tree.
 * On success the node has no children and no sibling, is tagged StmtK
 * with the given statement kind, and records the current lineno.
 * On allocation failure a message is written to the listing file and
 * NULL is returned.
 */
TreeNode* newStmtNode(StmtKind kind)
{
    TreeNode* node = (TreeNode*)malloc(sizeof(TreeNode));
    int slot;

    if (node == NULL)
    {
        fprintf(listing, "Out of memory error at line %d\n", lineno);
        return node;
    }

    slot = 0;
    while (slot < MAXCHILDREN)
    {
        node->child[slot] = NULL;
        slot++;
    }
    node->sibling = NULL;
    node->nodekind = StmtK;
    node->kind.stmt = kind;
    node->lineno = lineno;
    return node;
}

/* Function newExpNode allocates an expression node for the syntax tree.
 * On success the node has no children and no sibling, is tagged ExpK
 * with the given expression kind, records the current lineno and has
 * its type set to Void.
 * On allocation failure a message is written to the listing file and
 * NULL is returned.
 */
TreeNode* newExpNode(ExpKind kind)
{
    TreeNode* node = (TreeNode*)malloc(sizeof(TreeNode));
    int slot;

    if (node == NULL)
    {
        fprintf(listing, "Out of memory error at line %d\n", lineno);
        return node;
    }

    slot = 0;
    while (slot < MAXCHILDREN)
    {
        node->child[slot] = NULL;
        slot++;
    }
    node->sibling = NULL;
    node->nodekind = ExpK;
    node->kind.exp = kind;
    node->lineno = lineno;
    node->type = Void;
    return node;
}

/* Function copyString returns a freshly malloc'ed copy of s.
 * A NULL argument yields NULL. If the allocation fails, a message is
 * written to the listing file and NULL is returned.
 */
char* copyString(char* s)
{
    char* copy;
    int bytes;

    if (s == NULL)
        return NULL;

    bytes = strlen(s) + 1; /* room for the terminator */
    copy = (char*)malloc(bytes);
    if (copy != NULL)
        strcpy(copy, s);
    else
        fprintf(listing, "Out of memory error at line %d\n", lineno);
    return copy;
}

/* indentno holds the number of spaces printTree
 * currently indents by
 */
static int indentno = 0;

/* macros to increase/decrease indentation by one level (two spaces) */
#define INDENT indentno+=2
#define UNINDENT indentno-=2

/* printSpaces writes indentno spaces to the listing file */
static void printSpaces(void)
{
    int remaining = indentno;
    while (remaining > 0)
    {
        fprintf(listing, " ");
        remaining--;
    }
}

/* Procedure printTree prints a syntax tree to the listing file.
 *
 * Each node in the sibling chain starting at tree is printed on its
 * own line, followed by its children printed recursively one
 * indentation level deeper. A NULL tree prints nothing.
 */
void printTree(TreeNode* tree)
{
    int i;
    INDENT;
    while (tree != NULL) {
        printSpaces();
        if (tree->nodekind == StmtK)
        {
            switch (tree->kind.stmt) {
            case IfK:
                fprintf(listing, "If\n");
                break;
            case RepeatK:
                fprintf(listing, "Repeat\n");
                break;
            case AssignK:
                fprintf(listing, "Assign to: %s\n", tree->attr.name);
                break;
            case ReadK:
                fprintf(listing, "Read: %s\n", tree->attr.name);
                break;
            case WriteK:
                fprintf(listing, "Write\n");
                break;
            default:
                /* this branch handles statement nodes, so name them as such */
                fprintf(listing, "Unknown StmtNode kind\n");
                break;
            }
        }
        else if (tree->nodekind == ExpK)
        {
            switch (tree->kind.exp) {
            case OpK:
                fprintf(listing, "Op: ");
                /* operator tokens print a fixed symbol, so no lexeme is needed */
                printToken(tree->attr.op, "\0");
                break;
            case ConstK:
                fprintf(listing, "Const: %d\n", tree->attr.val);
                break;
            case IdK:
                fprintf(listing, "Id: %s\n", tree->attr.name);
                break;
            default:
                fprintf(listing, "Unknown ExpNode kind\n");
                break;
            }
        }
        else fprintf(listing, "Unknown node kind\n");
        for (i = 0; i < MAXCHILDREN; i++)
            printTree(tree->child[i]);
        tree = tree->sibling;
    }
    UNINDENT;
}

/* states in scanner DFA */
typedef enum
{
    START,      /* between tokens */
    INASSIGN,   /* entered after ':' */
    INCOMMENT,  /* entered after '{' */
    INNUM,      /* entered after a digit */
    INID,       /* entered after a letter */
    DONE,       /* a token has been completed */
    INLE,       /* entered after '<' */
    INME,       /* entered after '>' */
    INUPDOX     /* entered after a single quote */
} StateType;

/* lexeme of the token being scanned */
char tokenString[MAXTOKENLEN + 1];

/* BUFLEN = length of the input buffer for
   source code lines */
#define BUFLEN 256

/* holds the current line */
static char lineBuf[BUFLEN];
/* current position in lineBuf */
static int linepos = 0;
/* current size of buffer string */
static int bufsize = 0;
/* corrects ungetNextChar behavior on EOF */
static int EOF_flag = FALSE;

/* getNextChar returns the next character of the source text (blanks
 * and newlines included), refilling lineBuf with a new line when the
 * current one is exhausted.
 *
 * Characters are returned as unsigned char values converted to int, so
 * a byte >= 0x80 is never negative and cannot be mistaken for EOF.
 * When no further line can be read, EOF_flag is set and EOF is returned.
 */
static int getNextChar(void)
{
    if (linepos < bufsize)
        return (unsigned char)lineBuf[linepos++];

    /* current line exhausted: try to read the next one */
    if (fgets(lineBuf, BUFLEN - 1, source) == NULL)
    {
        EOF_flag = TRUE;
        return EOF;
    }

    /* count a line only once it has really been read, so that repeated
     * calls at end of file do not push lineno past the last line */
    lineno++;
    if (EchoSource)
        fprintf(listing, "%d: %s", lineno, lineBuf);
    bufsize = (int)strlen(lineBuf);
    linepos = 0;
    return (unsigned char)lineBuf[linepos++];
}

/* ungetNextChar steps back one character in lineBuf;
   it does nothing once EOF_flag has been set */
static void ungetNextChar(void)
{
    if (EOF_flag)
        return;
    --linepos;
}

/* lookup table of reserved words
 *
 * The array length is taken from the initializer, so the table and the
 * loop bound in reservedLookup cannot drift apart.
 * NOTE(review): TokenType also declares FLOAT and DOUBLE, but "float"
 * and "double" have no entry here, so they are scanned as ID - confirm
 * against the TINY+ specification whether they should be added.
 */
static const struct
{
    const char* str;
    TokenType tok;
} reservedWords[]
= { {"if",IF},{"then",THEN},{"else",ELSE},{"end",END},
    {"repeat",REPEAT},{"until",UNTIL},{"read",READ},
    {"write",WRITE},{"false",FALSE1},{"true",TRUE1},
    {"or",OR},{"and",AND},{"not",NOT},
    {"int",INT},{"bool",BOOL1},{"string",STRING},
    {"while",WHILE},{"do",DO} };

/* number of entries actually present in reservedWords */
#define RESERVED_COUNT (sizeof reservedWords / sizeof reservedWords[0])

/* reservedLookup checks whether the identifier s is a reserved word.
 * It returns the matching reserved-word token, or ID when s is not in
 * the table. Uses a simple linear search.
 */
static TokenType reservedLookup(const char* s)
{
    size_t i;
    for (i = 0; i < RESERVED_COUNT; i++)
        if (strcmp(s, reservedWords[i].str) == 0)
            return reservedWords[i].tok;
    return ID;
}

/****************************************/
/* the primary function of the scanner  */
/****************************************/
/* function getToken returns the
 * next token in source file
 */
TokenType getToken(void)//扫描读入的文法
{  /* index for storing into tokenString */
//索引
    int tokenStringIndex = 0;
    /* holds current token to be returned *///当前词法单元
    TokenType currentToken;
    /* current state - always begins at START *///当前状态—总是从头开始
    StateType state = START;
    /* flag to indicate save to tokenString *///标志，当前字符是否有效
    int save;
    while (state != DONE)//匹配到一个词素之后就结束
    {
        int c = getNextChar();
        save = TRUE;
        switch (state)//状态转移
        {
        case START:
            if (isdigit(c))//数字
                state = INNUM;
            else if (isalpha(c))//id
                state = INID;
            else if (c == ':')//冒号
                state = INASSIGN;
            else if(c=='>')
            {
                state=INME;
            }
            else if(c=='<')
            {
                state=INLE;
            }
            else if ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'))
                save = FALSE;
            else if (c == '{')//注释
            {
                save = FALSE;
                state = INCOMMENT;
            }
            else if(c=='\'')
            {
                save = FALSE;
                state = INUPDOX;
            }
            else
            {
                state = DONE;
                switch (c)
                {
                case EOF:
                    save = FALSE;
                    currentToken = ENDFILE;
                    break;
                case '=':
                    currentToken = EQ;
                    break;
                case '<':
                    currentToken = LT;
                    break;
                case '+':
                    currentToken = PLUS;
                    break;
                case '-':
                    currentToken = MINUS;
                    break;
                case '*':
                    currentToken = TIMES;
                    break;
                case '/':
                    currentToken = OVER;
                    break;
                case '(':
                    currentToken = LPAREN;
                    break;
                case ')':
                    currentToken = RPAREN;
                    break;
                case ';':
                    currentToken = SEMI;
                    break;
                case ',':
                    currentToken = COMMA;
                    break;
                case '%':
                    currentToken = PERCENT;
                    break;
                default:
                    currentToken = ERROR;
                    break;
                }
            }
            break;
        case INCOMMENT:
            save = FALSE;
            if (c == EOF)
            {
                state = DONE;
                currentToken = ERROR;
                strcpy(tokenString,"Missing \" } \" !");
	            tokenStringIndex+=15;
            }
            else if (c == '}')//直到遇到}为止
                state = START;
            break;
        case INUPDOX://冒号之后，TINY的语法，赋值语句
            if (c == '\'') 
	        {
	            save = FALSE;
	            state = DONE;
	            currentToken = STR;
	        }
	        else if (!(linepos < bufsize))
	        {
	            save = FALSE;
	            state = DONE;
	            currentToken = ERROR;
	            strcpy(tokenString,"Missing \" \' \" !");
	            tokenStringIndex+=15;
	        }
            break;
        case INASSIGN:
            state = DONE;
            if (c == '=')//:=代表赋值
                currentToken = ASSIGN;
            else
            { /* backup in the input */
                ungetNextChar();
                save = FALSE;
                currentToken = ERROR;
            }
            break;
        case INNUM:
            if (!isdigit(c))
            { /* backup in the input */
                ungetNextChar();//回退，相当于老师上课讲的jump
                save = FALSE;//不保存现在的字符
                state = DONE;//完成一个词素的匹配
                currentToken = NUM;//类型为NUM
            }
            break;
        case INLE:
            if (c=='=')
	        {  
                state = DONE;
	            currentToken = LE;
	        }
	        else
            { /* backup in the input */
                ungetNextChar();
                save = FALSE;
                state = DONE;
                currentToken = LT;
            }
            break;
        case INME:
            if (c=='=')
	        {  
                state = DONE; 
	            currentToken = ME;
	        }
	        else
            { /* backup in the input */
                ungetNextChar();
                save = FALSE;
                state = DONE;
                currentToken = MT;
            }
            break;
        case INID:
            if (!(isalpha(c)||isdigit(c)))
            { /* backup in the input */
                ungetNextChar();
                save = FALSE;
                state = DONE;
                currentToken = ID;
            }
            break;
        case DONE:
        default: /* should never happen */
            fprintf(listing, "Scanner Bug: state= %d\n", state);
            state = DONE;
            currentToken = ERROR;
            break;
        }
        if ((save) && (tokenStringIndex <= MAXTOKENLEN))
            tokenString[tokenStringIndex++] = (char)c;
        if (state == DONE)
        {
            tokenString[tokenStringIndex] = '\0';
            if (currentToken == ID)
                currentToken = reservedLookup(tokenString);//检测是否为保留字
        }
    }
    if (TraceScan) {
        fprintf(listing, "\t%d: ", lineno);
        printToken(currentToken, tokenString);
    }
    return currentToken;
} /* end getToken */

/* set NO_PARSE to TRUE to get a scanner-only compiler */
#define NO_PARSE TRUE
/* set NO_ANALYZE to TRUE to get a parser-only compiler
 * (was misspelled "TURE", which is not a defined name)
 */
#define NO_ANALYZE TRUE

/* set NO_CODE to TRUE to get a compiler that does not
 * generate code
 */
#define NO_CODE FALSE

 /* allocate global variables */
/* current source line number */
int lineno = 0;
/* source, listing and code files */
FILE *source;
FILE *listing;
FILE *code;

/* allocate and set tracing flags */
int EchoSource   = TRUE;
int TraceScan    = TRUE;
int TraceParse   = TRUE;
int TraceAnalyze = FALSE;
int TraceCode    = FALSE;

/* set when an error has been detected */
int Error = FALSE;

/* Scanner-only driver: opens the source file, sends the listing to
 * stdout and calls getToken until ENDFILE is returned.
 *
 * The source file name is taken from the first command-line argument
 * when one is given; otherwise the default "tiny+2.txt" is used.
 * Exits with status 1 if the file cannot be opened.
 */
int main(int argc, char* argv[])
{
    const char* pgm = (argc > 1) ? argv[1] : "tiny+2.txt";

    source = fopen(pgm, "r");
    if (source == NULL)
    {
        fprintf(stderr, "File %s not found\n", pgm);
        exit(1);
    }
    listing = stdout; /* send listing to screen */
    fprintf(listing, "\nTINY COMPILATION: %s\n\n", pgm);

    /* getToken prints each token itself when TraceScan is set */
    while (getToken() != ENDFILE)
        ;

    fclose(source);
#ifdef _WIN32
    /* keep the console window open; "pause" only exists on Windows */
    system("pause");
#endif
    return 0;
}