编译原理之Tiny C 词法分析器

最新推荐文章于 2024-08-19 22:17:02 发布

cg_Amaz1ng

最新推荐文章于 2024-08-19 22:17:02 发布

阅读量4.4k

点赞数 2

分类专栏：编译原理文章标签：编译原理 c语言

本文链接：https://blog.csdn.net/cg_Amaz1ng/article/details/70213799

版权

编译原理专栏收录该内容

4 篇文章 0 订阅

订阅专栏

最近学编译原理，其中一个实验是完成Tiny C词法分析器，写实验报告之前先贴上来。

Tiny C语言编译程序实验一 Scanner

Tiny C词法规定：
- 仅允许整数类型，不允许实数类型
- 标识符由大小写英文字母组成，最多52个。其识别按最长匹配原则
- 整数后紧跟非数字，或标识符后紧跟非字母认为是一个新Token开始
- 由{ }括起来符号串都认为是注释部分，该部分在词法分析时被过滤掉
- 识别出的Token由两个变量：currentToken，tokenString识别，其中currentToken代表Token的类属，为一个名为TokenType的枚举类型，在文件globals.h中定义；tokenString代表Token在程序中出现的形式，即其本来面目。例如整数 10的currentToken值为NUM，而tokenString值为‘10’；标识符i的currentToken值为ID，而tokenString值为‘i’

Tiny C的Token类型定义：

/* Token categories returned by the scanner. */
typedef  enum 
   { ENDFILE,ERROR,          /* book-keeping tokens */
    IF,THEN,ELSE,END,REPEAT,UNTIL,READ,WRITE,      /* reserved words */
    ID,NUM,                  /* multicharacter tokens */
ASSIGN,EQ,LT,PLUS,MINUS,TIMES,OVER,LPAREN,RPAREN,SEMI  /* special symbols */
   } TokenType;

/* lookup table of reserved words: each entry pairs the spelling of a
   reserved word with the token that stands for it */
static struct
    { char* str;
      TokenType tok;
    } reservedWords[MAXRESERVED]
   = {{"if",IF},{"then",THEN},{"else",ELSE},{"end",END},
      {"repeat",REPEAT},{"until",UNTIL},{"read",READ},
      {"write",WRITE}};

要求：画识别符合TINY C语言构词规则的DFA。然后用直接编码的方法构造词法分析器。
DFA状态图（即确定性有限自动机）：
这里写图片描述

按照DFA构造getToken函数，getToken返回的是一个TokenType枚举变量，如何正确分析词法在于getToken函数。
在main函数里，利用while(getToken()!=ENDFILE)以TokenType为循环基础进行词法分析。对于自上一次获取到Token，首先进入的是START初始状态毫无疑问，当不为终极态DONE的时候，不断读取字符并且用switch进入对应当前state的case里。

在每次进入getToken的循环时会读取下一个字符，当读到文件末尾的时候，该函数getNextChar()返回EOF，此时应该return ENDFILE;
在START状态下输入空格、制表符、换行符，将持续进入初始状态START，并且不保存，词法分析阶段过滤掉。
在START状态下输入= , + , - , * , / , ; , <的时候直接进入终极态，currentToken设置为相应的Token值。
在START下当输入数字的时候，进入INNUM状态，表示当前进入值输入的状态，在下一次循环中switch判断state的时候，进入INNUM状态，当输入的不是数字的时候表示当前状态终止，进入终极态DONE，currentToken设置为NUM，表示已经识别出一个NUM Token，并且回退一个字符，save设置为false。这样在tokenString中保存的就是整数。
在START下当输入字母的时候，进入INID状态，表示当前进入ID输入状态，具体的操作同上。但是每一次进入循环要判断当前保存的tokenString长度是否超过52，如果超过52，则只保留前52个字母为变量。其次，在扫描出Token为ID之后，应该调用函数reservedLookup(char * s)判断当前的tokenString有没有可能是关键字，如果是，返回关键字的Token，否则返回ID。
在START下当输入{ ,将进入INCOMMENT状态，表示当前进入了注释状态。当下一次输入为 } 或者EOF时，返回初态START，并且每次都不保存。
在START下当输入 : 的时候进入INASSIGN状态，下一次输入当输入 = 表示识别出了:= 即赋值标志，进入终极态DONE，currentToken = ASSIGN。否则报错，因为不存在单独的 : 符号。

scan.c 核心程序（用于读取已经打开的文件流，并且按照一个字符一个字符扫描得到Token）：

/****************************************************/
/* File: scan.c                                     */
/* The scanner implementation for the TINY compiler */
/****************************************************/

#include "globals.h"
#include "util.h"
#include "scan.h"

/* states in scanner DFA
 *   START     - between tokens; also re-entered after white space
 *               and after a comment has been closed
 *   INASSIGN  - a ':' has been read, '=' is expected next
 *   INCOMMENT - inside a { ... } comment
 *   INNUM     - reading the digits of a number
 *   INID      - reading the letters of an identifier
 *   DONE      - a complete token has been recognized
 */
typedef enum
   { START,INASSIGN,INCOMMENT,INNUM,INID,DONE }
   StateType;

/* lexeme of the token being scanned; getToken stores every saved
   character here and terminates it with '\0' */
char tokenString[MAXTOKENLEN+1];

/* BUFLEN = length of the input buffer for
   source code lines */
#define BUFLEN 256

static char lineBuf[BUFLEN]; /* holds the current line */
static int linepos = 0; /* current position in LineBuf */
static int bufsize = 0; /* current size of buffer string */
/* FALSE means a character may still be pushed back; set to TRUE by
   getNextChar at end of input so that ungetNextChar does nothing
   (corrects ungetNextChar behavior on EOF) */
static int EOF_flag = FALSE;

/* getNextChar returns the next character of the source text (blanks
 * included), refilling lineBuf with the next line of `source`
 * whenever the current line has been used up.
 *
 * Returns the character as a non-negative int (converted through
 * unsigned char, so a byte >= 0x80 can neither be mistaken for EOF nor
 * reach the <ctype.h> classifiers as a negative value), or EOF once no
 * further input can be read.  On EOF, EOF_flag is set so that
 * ungetNextChar becomes a no-op.
 *
 * lineno is advanced only when a line has actually been read, so it
 * keeps naming the last real line once end of input is reached.  When
 * EchoSource is set, each freshly read line is echoed to `listing`
 * prefixed with its line number.
 */
static int getNextChar(void)
{ if (linepos < bufsize)
    return (unsigned char) lineBuf[linepos++];

  /* fgets stores at most size-1 characters plus the terminating
     '\0', so the whole buffer can be handed to it */
  if (fgets(lineBuf,sizeof lineBuf,source) == NULL)
  { EOF_flag = TRUE;
    return EOF;
  }
  lineno++;
  if (EchoSource)
    fprintf(listing,"%4d: %s",lineno,lineBuf);
  bufsize = (int) strlen(lineBuf);
  linepos = 0;
  return (unsigned char) lineBuf[linepos++];
}

/* ungetNextChar pushes the most recently read character back by
   moving linepos one position backwards in lineBuf; once EOF_flag
   has been set there is nothing to push back and the call does
   nothing */
static void ungetNextChar(void)
{ if (EOF_flag)
    return;
  linepos--;
}

/* lookup table of reserved words: each entry pairs the spelling of a
   reserved word with the token that stands for it.  The spellings are
   string literals and the table is never written to, hence the const
   qualifiers. */
static const struct
    { const char* str;
      TokenType tok;
    } reservedWords[MAXRESERVED]
   = {{"if",IF},{"then",THEN},{"else",ELSE},{"end",END},
      {"repeat",REPEAT},{"until",UNTIL},{"read",READ},
      {"write",WRITE}};

/* reservedLookup classifies an identifier lexeme: a linear scan of
   reservedWords returns the token of the first entry spelled exactly
   like s, and ID when no entry matches */
static TokenType reservedLookup (char * s)
{ int idx = 0;
  while (idx < MAXRESERVED)
  { if (strcmp(s,reservedWords[idx].str) == 0)
      return reservedWords[idx].tok;
    idx++;
  }
  return ID;
}

/****************************************/
/* the primary function of the scanner  */
/****************************************/
/* function getToken returns the
 * next token in source file
 */
TokenType getToken(void)
{  /* index for storing into tokenString */
   int tokenStringIndex = 0;
   /* holds current token to be returned */
   TokenType currentToken; //保存被识别Token的类属
   /* current state - always begins at START */
   StateType state = START;
   /* flag to indicate save to tokenString */
   int save;//标识当前字符是否保存，如空格，换行符\n、TAB符\t及注释中的任何字符
   while (state != DONE)  //DONE状态表示已识别出一个Token
   { int c = getNextChar();
     if(c==EOF){
         return ENDFILE;
     }
     //注意此时linepos指向下一个
     save = TRUE;
     switch (state)
     { case START:
         if (c==' '){
            state = START;
            save = FALSE;
         }
         if (c=='\t'){
            state = START;
            save = FALSE;
         }
         if (c=='\n'){    //这个很重要，加了才能正确解析换行后的内容
            state = START;
            save = FALSE;
         }
         else if (isdigit(c)){
           state = INNUM;
         }
                    //此处请自己填写（字符、:、空格/tab/换行、{、算符及界符等）
         else if(isalpha((int)c)){
           state = INID;
         }
         else if(c=='+'){//要保存
             state = DONE;
             currentToken = PLUS;
         }
         else if(c=='-'){//要保存
             state = DONE;
             currentToken = MINUS;
         }
         else if(c=='*'){//要保存
             state = DONE;
             currentToken = TIMES;
         }
         else if(c=='/'){//要保存
             state = DONE;
             currentToken = OVER;
         }
         else if(c=='<'){//要保存
             state = DONE;
             currentToken = LT;
         }
         else if(c==';'){//要保存
             state = DONE;
             currentToken = SEMI;
         }
         else if(c=='='){
             state = DONE;
             currentToken = EQ;
         }
         else if(c==':'){
             state = INASSIGN;
         }
         else if(c=='{'){
              ungetNextChar();
              save = FALSE;
              state = INCOMMENT;
         }
         else
             currentToken = ERROR;
         break;
       case INCOMMENT://注释的currentToken没有定义

         if (c == '}'||c == EOF){         //此处请自己填写，仅出现‘}’或EOF（注释未完结束程序）时才改变状态。
             state = START;
         }
        save = FALSE;
         break;
       case INASSIGN:
                 //此处请自己填写，‘=’或其它（出现错误）
         if (c == '='){ /* backup in the input */
           state = DONE;
           currentToken = ASSIGN;
         }
         else{
           save = FALSE;
           state = DONE;
           currentToken = ERROR;
         }

         break;
       case INNUM:
         if (!isdigit(c))
         { /* backup in the input */
           ungetNextChar();
           save = FALSE;
           state = DONE;
           currentToken = NUM;
         }
         break;
       case INID:
         if(!isalpha((int)c)){//此处请自己填写，不是字符则回吐，并进入DONE，且识别出一个ID
             ungetNextChar();
             save = FALSE;
             state = DONE;
             currentToken = ID;
         }
         if(tokenStringIndex>51){
            save = FALSE;
            state = DONE;
            currentToken = ID;
         }
         break;
       case DONE:     //不可能到
       default: /* should never happen */
         fprintf(listing,"Scanner Bug: state= %d\n",state);
         state = DONE;
         currentToken = ERROR;
         break;
     } //end switch
     //将每一次的输入保存起来，从上一次找到Token的位置开始保存
     //只有当SAVE为TRUE才保存，即还得继续往下输入；当为FALSE表示TOKEN已经找到于是不必保存
     if ((save) && (tokenStringIndex <= MAXTOKENLEN))
       tokenString[tokenStringIndex++] = (char) c;
     if (state == DONE)
     { tokenString[tokenStringIndex] = '\0';
       if (currentToken == ID)
       //匹配当前的ID是否为保留字
         currentToken = reservedLookup(tokenString);
     }
   }  //end while
   if (TraceScan) {
     fprintf(listing,"\t%d: ",lineno);
     printToken(currentToken,tokenString);
   }
   return currentToken;
} /* end getToken */

配套的一个scan.h

/****************************************************/
/* File: scan.h                                     */
/* The scanner interface for the TINY compiler      */
/****************************************************/

#ifndef _SCAN_H_
#define _SCAN_H_

/* MAXTOKENLEN is the maximum size of a token; it is 52 so that
   tokenString can hold the longest identifier the scanner keeps
   (getToken limits identifiers to 52 letters) */
#define MAXTOKENLEN 52

/* tokenString array stores the lexeme of each token */
extern char tokenString[MAXTOKENLEN+1];

/* function getToken returns the 
 * next token in source file
 */
TokenType getToken(void);

#endif

globals.h 程序的一些全局声明

/****************************************************/
/* File: globals.h                                  */
/* Global types and vars for TINY compiler          */
/* must come before other include files             */
/****************************************************/

#ifndef _GLOBALS_H_
#define _GLOBALS_H_

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h> //用到其中的isdigit函数和isalpha函数
#include <string.h>

#ifndef FALSE
#define FALSE 0
#endif

#ifndef TRUE
#define TRUE 1
#endif

/* MAXRESERVED = the number of reserved words */
#define MAXRESERVED 8

typedef enum
    /* book-keeping tokens */
   {ENDFILE,ERROR,
    /* reserved words */
    IF,THEN,ELSE,END,REPEAT,UNTIL,READ,WRITE,
    /* multicharacter tokens */
    ID,NUM,
    /* special symbols */
    ASSIGN,EQ,LT,PLUS,MINUS,TIMES,OVER,LPAREN,RPAREN,SEMI
   } TokenType;

extern FILE* source; /* source code text file */
extern FILE* listing; /* listing output text file */
extern FILE* code; /* code text file for TM simulator */

extern int lineno; /* source line number for listing */

/**************************************************/
/***********   Syntax tree for parsing ************/
/**************************************************/

/* NodeKind tells whether a tree node is a statement or an expression */
typedef enum {StmtK,ExpK} NodeKind;
/* StmtKind: the kinds of statement nodes (used when nodekind is StmtK) */
typedef enum {IfK,RepeatK,AssignK,ReadK,WriteK} StmtKind;
/* ExpKind: the kinds of expression nodes (used when nodekind is ExpK) */
typedef enum {OpK,ConstK,IdK} ExpKind;

/* ExpType is used for type checking */
typedef enum {Void,Integer,Boolean} ExpType;

/* number of child pointers in every tree node */
#define MAXCHILDREN 3

/* TreeNode is one node of the syntax tree.  newStmtNode/newExpNode
 * create nodes with all child and sibling pointers NULL; printTree
 * walks the children of a node and then follows its sibling link.
 */
typedef struct treeNode
   { struct treeNode * child[MAXCHILDREN];
     struct treeNode * sibling;
     int lineno; /* value of the global lineno when the node was created */
     NodeKind nodekind; /* selects which member of kind is valid */
     union { StmtKind stmt; ExpKind exp;} kind;
     /* printTree reads op for OpK nodes, val for ConstK nodes and
        name for IdK, AssignK and ReadK nodes */
     union { TokenType op;
             int val;
             char * name; } attr;
     ExpType type; /* for type checking of exps */
   } TreeNode;

/**************************************************/
/***********   Flags for tracing       ************/
/**************************************************/

/* EchoSource = TRUE causes the source program to
 * be echoed to the listing file with line numbers
 * during parsing
 */
extern int EchoSource ;

/* TraceScan = TRUE causes token information to be
 * printed to the listing file as each token is
 * recognized by the scanner
 */
extern int TraceScan ;

/* TraceParse = TRUE causes the syntax tree to be
 * printed to the listing file in linearized form
 * (using indents for children)
 */
extern int TraceParse;

/* TraceAnalyze = TRUE causes symbol table inserts
 * and lookups to be reported to the listing file
 */
extern int TraceAnalyze;

/* TraceCode = TRUE causes comments to be written
 * to the TM code file as code is generated
 */
extern int TraceCode;

/* Error = TRUE prevents further passes if an error occurs */
extern int Error;
#endif

util.c 用于输出字符串

/****************************************************/
/* File: util.c                                     */
/* Utility function implementation                  */
/* for the TINY compiler                            */
/****************************************************/

#include "globals.h"
#include "util.h"
/* Procedure printToken prints a token
 * and its lexeme to the listing file
 *
 * Numbers, identifiers, errors and reserved words are printed
 * together with tokenString; the special symbols and ENDFILE are
 * printed with a fixed spelling; any other value is reported as an
 * unknown token together with its numeric code.
 */
void printToken( TokenType token, const char* tokenString )
{ /* tokens whose text varies are dealt with first */
  if (token == NUM)
    fprintf(listing,
        "NUM, val= %s\n",tokenString);
  else if (token == ID)
    fprintf(listing,
        "ID, name= %s\n",tokenString);
  else if (token == ERROR)
    fprintf(listing,
        "ERROR: %s\n",tokenString);
  else switch (token)
  { case IF: case THEN: case ELSE: case END:
    case REPEAT: case UNTIL: case READ: case WRITE:
      fprintf(listing,
         "reserved word: %s\n",tokenString);
      break;
    case ENDFILE:
      fprintf(listing,"EOF\n");
      break;
    case ASSIGN:
      fprintf(listing,":=\n");
      break;
    case EQ:
      fprintf(listing,"=\n");
      break;
    case LT:
      fprintf(listing,"<\n");
      break;
    case PLUS:
      fprintf(listing,"+\n");
      break;
    case MINUS:
      fprintf(listing,"-\n");
      break;
    case TIMES:
      fprintf(listing,"*\n");
      break;
    case OVER:
      fprintf(listing,"/\n");
      break;
    case LPAREN:
      fprintf(listing,"(\n");
      break;
    case RPAREN:
      fprintf(listing,")\n");
      break;
    case SEMI:
      fprintf(listing,";\n");
      break;
    default: /* should never happen */
      fprintf(listing,"Unknown token: %d\n",token);
      break;
  }
}

/* Function newStmtNode creates a new statement
 * node for syntax tree construction
 *
 * The node is allocated with malloc; its child and sibling links are
 * cleared, it is tagged as a statement of the given kind and stamped
 * with the current lineno.  When the allocation fails a message is
 * written to the listing file and NULL is returned.
 */
TreeNode * newStmtNode(StmtKind kind)
{ TreeNode * node = malloc(sizeof *node);
  int slot;
  if (node == NULL)
  { fprintf(listing,"Out of memory error at line %d\n",lineno);
    return NULL;
  }
  slot = 0;
  while (slot < MAXCHILDREN)
    node->child[slot++] = NULL;
  node->sibling = NULL;
  node->nodekind = StmtK;
  node->kind.stmt = kind;
  node->lineno = lineno;
  return node;
}

/* Function newExpNode creates a new expression
 * node for syntax tree construction
 *
 * The node is allocated with malloc; its child and sibling links are
 * cleared, it is tagged as an expression of the given kind, stamped
 * with the current lineno and given the type Void.  When the
 * allocation fails a message is written to the listing file and NULL
 * is returned.
 */
TreeNode * newExpNode(ExpKind kind)
{ TreeNode * node = malloc(sizeof *node);
  int slot;
  if (node == NULL)
  { fprintf(listing,"Out of memory error at line %d\n",lineno);
    return NULL;
  }
  slot = 0;
  while (slot < MAXCHILDREN)
    node->child[slot++] = NULL;
  node->sibling = NULL;
  node->nodekind = ExpK;
  node->kind.exp = kind;
  node->lineno = lineno;
  node->type = Void;
  return node;
}

/* Function copyString allocates and makes a new
 * copy of an existing string
 *
 * Returns NULL when s is NULL.  When the allocation fails a message
 * is written to the listing file and NULL is returned.  Otherwise the
 * result is a malloc'ed copy of s, terminator included.
 */
char * copyString(char * s)
{ size_t n; /* bytes to copy, terminator included; size_t avoids
               narrowing the result of strlen */
  char * t;
  if (s==NULL) return NULL;
  n = strlen(s)+1;
  t = malloc(n);
  if (t==NULL)
    fprintf(listing,"Out of memory error at line %d\n",lineno);
  else memcpy(t,s,n);
  return t;
}

/* Variable indentno is used by printTree to
 * store current number of spaces to indent
 */
static int indentno = 0;

/* macros to increase/decrease indentation */
#define INDENT indentno+=2
#define UNINDENT indentno-=2

/* printSpaces writes indentno blanks to the listing file, one at a
   time; nothing is written when indentno is zero or negative */
static void printSpaces(void)
{ int remaining = indentno;
  while (remaining-- > 0)
    fprintf(listing," ");
}

/* procedure printTree prints a syntax tree to the
 * listing file using indentation to indicate subtrees
 *
 * Each node is printed on a line of its own, then its children are
 * printed recursively one indentation level deeper, then the walk
 * continues with the node's sibling.  A NULL tree prints nothing.
 */
void printTree( TreeNode * tree )
{ int i;
  INDENT;
  while (tree != NULL) {
    printSpaces();
    if (tree->nodekind==StmtK)
    { switch (tree->kind.stmt) {
        case IfK:
          fprintf(listing,"If\n");
          break;
        case RepeatK:
          fprintf(listing,"Repeat\n");
          break;
        case AssignK:
          fprintf(listing,"Assign to: %s\n",tree->attr.name);
          break;
        case ReadK:
          fprintf(listing,"Read: %s\n",tree->attr.name);
          break;
        case WriteK:
          fprintf(listing,"Write\n");
          break;
        default:
          /* this branch handles statement nodes, so the message
             names StmtNode rather than ExpNode */
          fprintf(listing,"Unknown StmtNode kind\n");
          break;
      }
    }
    else if (tree->nodekind==ExpK)
    { switch (tree->kind.exp) {
        case OpK:
          fprintf(listing,"Op: ");
          /* operators are printed by printToken; no lexeme needed */
          printToken(tree->attr.op,"\0");
          break;
        case ConstK:
          fprintf(listing,"Const: %d\n",tree->attr.val);
          break;
        case IdK:
          fprintf(listing,"Id: %s\n",tree->attr.name);
          break;
        default:
          fprintf(listing,"Unknown ExpNode kind\n");
          break;
      }
    }
    else fprintf(listing,"Unknown node kind\n");
    for (i=0;i<MAXCHILDREN;i++)
         printTree(tree->child[i]);
    tree = tree->sibling;
  }
  UNINDENT;
}

util.h

/****************************************************/
/* File: util.h                                     */
/* Utility functions for the TINY compiler          */
/****************************************************/

#ifndef _UTIL_H_
#define _UTIL_H_

/* Procedure printToken prints a token 
 * and its lexeme to the listing file
 */
void printToken( TokenType, const char* );

/* Function newStmtNode creates a new statement
 * node for syntax tree construction
 */
TreeNode * newStmtNode(StmtKind);

/* Function newExpNode creates a new expression 
 * node for syntax tree construction
 */
TreeNode * newExpNode(ExpKind);

/* Function copyString allocates and makes a new
 * copy of an existing string
 */
char * copyString( char * );

/* procedure printTree prints a syntax tree to the 
 * listing file using indentation to indicate subtrees
 */
void printTree( TreeNode * );

#endif

main.c 主程序

/****************************************************/
/* File: main.c                                     */
/* Main program for TINY compiler                   */
/****************************************************/

#include "globals.h"

/* set NO_PARSE to TRUE to get a scanner-only compiler */
#define NO_PARSE TRUE
/* set NO_ANALYZE to TRUE to get a parser-only compiler */
#define NO_ANALYZE FALSE

/* set NO_CODE to TRUE to get a compiler that does not
 * generate code
 */
#define NO_CODE FALSE

#include "util.h"
#if NO_PARSE
#include "scan.h"
#else
//#include "parse.h"
#if NO_ANALYZE
//#include "analyze.h"
#if NO_CODE
//#include "cgen.h"
#endif
#endif
#endif

/* allocate global variables */
int lineno = 0;  /* source line number for listing */
FILE * source;   /* source code text file, opened in main */
FILE * listing;  /* listing output; main points it at stdout */
FILE * code;     /* code text file for TM simulator */

/* allocate and set tracing flags */
int EchoSource = TRUE;
int TraceScan = TRUE;
/* key setting: TraceScan must be TRUE, because getToken prints a
   token only when this flag is set */
int TraceParse = FALSE;
int TraceAnalyze = FALSE;
int TraceCode = FALSE;

int Error = FALSE;

/* main runs the scanner-only compiler: it opens the source file named
 * on the command line (".tny" is appended when the name contains no
 * '.'), sends the listing to stdout and calls getToken until ENDFILE
 * is returned.  The later compiler phases are kept below as
 * commented-out code.
 *
 * Exits with status 1 when the argument count is wrong, when the file
 * name does not fit into the name buffer, or when the file cannot be
 * opened; returns 0 otherwise.
 */
int main( int argc, char * argv[] )
{ char pgm[120]; /* source code file name */
  int len;       /* length snprintf needed for the file name */
  if (argc != 2)
    { fprintf(stderr,"usage: %s <filename>\n",argv[0]);
      exit(1);
    }
  /* build the file name with a bounded copy: the name comes from the
     command line and may be longer than pgm */
  len = snprintf(pgm,sizeof pgm,"%s%s",argv[1],
                 (strchr(argv[1],'.') == NULL) ? ".tny" : "");
  if (len < 0 || (size_t) len >= sizeof pgm)
  { fprintf(stderr,"File name too long: %s\n",argv[1]);
    exit(1);
  }
  source = fopen(pgm,"r");
  /* the file does not exist or cannot be read */
  if (source==NULL)
  { fprintf(stderr,"File %s not found\n",pgm);
    exit(1);
  }
  /* the file is open */
  listing = stdout; /* send listing to screen */
  fprintf(listing,"\nTINY COMPILATION: %s\n",pgm);
//#if NO_PARSE
  /* lexical analysis: getToken prints each token itself when
     TraceScan is set, so the loop body is empty */
  while (getToken()!=ENDFILE)
    ;

//#else   printf("1\n");
  //TreeNode * syntaxTree;
  //syntaxTree = parse();                  /* syntax analysis */
  //if (TraceParse) {
  //  fprintf(listing,"\nSyntax tree:\n");
  //  printTree(syntaxTree);
  //}
//#if NO_ANALYZE                          /* semantic analysis, disabled for now */
  //if (! Error)
  //{ if (TraceAnalyze) fprintf(listing,"\nBuilding Symbol Table...\n");
  //  buildSymtab(syntaxTree);
  //  if (TraceAnalyze) fprintf(listing,"\nChecking Types...\n");
  //  typeCheck(syntaxTree);
  //  if (TraceAnalyze) fprintf(listing,"\nType Checking Finished\n");
  //}
//#if NO_CODE
  //if (! Error)
  //{ char * codefile;
  //  int fnlen = strcspn(pgm,".");
  //  codefile = (char *) calloc(fnlen+4, sizeof(char));
  //  strncpy(codefile,pgm,fnlen);
  //  strcat(codefile,".tm");
  //  code = fopen(codefile,"w");
  //  if (code == NULL)
  //  { printf("Unable to open %s\n",codefile);
  //    exit(1);
  //  }
  //  codeGen(syntaxTree,codefile);             /* code generation, disabled for now */
  //  fclose(code);
  //}
//#endif
//#endif
//#endif
  fclose(source);
  return 0;
}

实例：SAMPLE.tny

{ Sample program
  in TINY language -
  computes factorial
}
read x; { input an integer }
if 0 < x then { don't compute if x <= 0 }
  fact := 1;
  repeat
    fact := fact * x;
    x := x - 1
  until x = 0;
  write fact  { output factorial of x }
end