【编译原理】PL/0编译程序之词法分析 | DFA | C语言实现

最新推荐文章于 2022-09-30 16:26:11 发布

悄悄地把鞋带系好

最新推荐文章于 2022-09-30 16:26:11 发布

阅读量2.2k

点赞数 14

分类专栏：笔记　文章标签：状态机、编译器

本文链接：https://blog.csdn.net/qq_44850725/article/details/114217589

版权

笔记专栏收录该内容

5 篇文章 0 订阅

订阅专栏

视频讲解

核心

遍历字符流，遇到不能识别的字符而结束本次识别时，回退一个字符，让它能被继续识别
重点：有限状态机三段式：（1）定义状态（2）根据现态和当前字符计算次态（3）更新现态 （和数字逻辑里的时序电路FSM设计差不多）
识别单词的时候分为：1-标识符关键字,2-整数,3-复合运算符,4-单独字符
类别码是1,2,3…，用enum在头文件里定义了。
单独字符的类型码统一定义在ssym数组里了，关键字的类型码统一定义在wsym数组里了。因为他们是一一对应的，用数组比较简洁。
报错做的很简陋，几乎没有
识别到token就直接输出了，万事从简（懒）…理解思想和方法就可再改进😃
C 库函数 int isalpha( int c )：判断字符是否是字母，int isdigit ( int c )：判断字符是否是数字。当然也可以直接比较ASCII码来判断。
C 库函数 int strcmp(const char *str1, const char *str2) 把 str1 所指向的字符串和 str2 所指向的字符串进行比较。
对于细节要耐心，注释和空格回车等字符的处理，获取下一个字符的时机🥰

图里没有画注释了，因为最后才发现把注释识别放在自动机里特别好用。

在这里插入图片描述
课本上的图

my2.c（视频P2）

#include "my2.h"

// Print the diagnostic registered under error code n.
//
// n is an index into err_msg[]. A code outside the table is reported as an
// unknown error code instead of reading past the end of the array.
void error(int n)
{
    const int count = (int)(sizeof err_msg / sizeof err_msg[0]);

    if (n < 0 || n >= count)
    {
        printf("Error %3d: %s\n", n, "Unknown error code.");
        return;
    }
    printf("Error %3d: %s\n", n, err_msg[n]);
}

void lexer(FILE *fp)
{
    int num = 0;          //当前识别中的数字
    int k = 0;            //当前识别中的数字的长度
    char a[MAXIDLEN + 1]; //当前识别中的标识符or关键字
    int a_index = 0;      //当前识别中的标识符or关键字的下标

    ch = fgetc(fp); //获取文件第一个字符

    while (ch != EOF)
    {
        switch (currState)
        {
        case START:
            if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')
            { //不能在switch外面忽略这些字符，在这里，他们不是无效的，他们可以表示标识符的结束等
            }
            else if (ch == '{')
            { //注释{}
                currState = COMMENT;
            }
            else if (isdigit(ch))
            {
                currState = INNUM;
                num = num * 10 + ch - '0';
                k++;
            }
            else if (isalpha(ch))
            {
                currState = INID;
                if (a_index > MAXIDLEN)
                {
                    error(26);
                    exit(1);
                }
                a[a_index] = ch;
                a_index++;
            }
            else if (ch == ':')
                currState = INBECOMES;
            else if (ch == '>')
                currState = GTR;
            else if (ch == '<')
                currState = LES;
            else
            { //单独字符直接识别
                currState = START;
                int i = 1;
                for (; i <= NSYM; i++)
                {
                    if (ch == csym[i])
                        break;
                }
                if (i <= NSYM)
                {
                    sym = ssym[i];
                    printf("(%d,%c)\n", sym, ch);
                }
                else
                {
                    error(0);
                    // exit(1);
                    printf("the char is ---%c---\n", ch);
                }
            }
            break;
        case INNUM:
            if (isdigit(ch))
            {
                num = num * 10 + ch - '0';
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_NUMBER;
                if (k > MAXNUMLEN)
                    error(25);
                else
                {
                    printf("(%d,%d)\n", sym, num);
                }
                k = 0;
                num = 0;
            }
            break;
        case COMMENT:
            if (ch == '}')
            { // 注释结束
                currState = START;
            }
            break;
        case INID:
            if (isalpha(ch) || isdigit(ch))
            {
                if (a_index > MAXIDLEN)
                {
                    error(26);
                    exit(1);
                }
                a[a_index] = ch;
                a_index++;
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                a[a_index] = '\0';   // 字符数组和字符串的区别就是结尾少了\0，一定要加上！
                // 检查是否为关键字
                int i = 1;
                for (; i <= NRW; i++)
                {
                    if (strcmp(a, word[i]) == 0)
                        break;
                }
                if (i <= NRW)
                {
                    sym = wsym[i]; // symbol is a reserved word
                }
                else
                {
                    sym = SYM_IDENTIFIER; // symbol is an identifier
                }
                printf("(%d,%s)\n", sym, a);
                a_index = 0;
            }
            break;
        case INBECOMES:
            if (ch == '=')
            {
                currState = BECOMES;
            }
            else
            {
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_NULL;
            }
            break;
        case GTR:
            if (ch == '=')
            {
                currState = GEQ;
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_GTR;
                printf("(%d,>)\n", sym);
            }
            break;
        case LES:
            if (ch == '=')
            {
                currState = LEQ;
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_LES;
                printf("(%d,<)\n", sym);
            }
            break;
        case BECOMES: //token识别完毕
            currState = START;
            ch = ungetc(ch, fp); // 回退该字符，重新识别
            sym = SYM_BECOMES;
            printf("(%d,:=)\n", sym);
            break;
        case GEQ: //token识别完毕
            currState = START;
            ch = ungetc(ch, fp); // 回退该字符，重新识别
            sym = SYM_GEQ;
            printf("%d,>=\n", sym);
            break;
        case LEQ: //token识别完毕
            currState = START;
            ch = ungetc(ch, fp); // 回退该字符，重新识别
            sym = SYM_LEQ;
            printf("%d,<=\n", sym);
            break;
        }

        //在最后获取下一个字符
        ch = fgetc(fp);
    }
    printf("—————文件读取结束—————");
}

// Entry point of the my2 demo: run the lexer over "source.txt".
//
// Returns 0 after a complete scan and 1 when the input file cannot be opened.
int main()
{
    FILE *fp = fopen("source.txt", "r");
    if (!fp)
    {
        printf("文件不存在");
        return 1;
    }

    lexer(fp);
    fclose(fp); // release the stream opened above
    return 0;
}

my2.h

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define NRW        11     // number of reserved words
#define MAXNUMLEN  14     // maximum number of digits in numbers
#define NSYM       10     // maximum number of symbols in array ssym and csym
#define MAXIDLEN   10     // length of identifiers

// Last character read. This is an int rather than a char because it receives
// the result of fgetc(): EOF has to stay distinguishable from every byte value,
// and the <ctype.h> functions are only defined for such values.
int  ch;
int  sym;        // last symbol read

// States of the scanner automaton and the current state.
enum state {
	START,INNUM,INID,INBECOMES,BECOMES,GTR,GEQ,NEQ,LES,LEQ,END,COMMENT
};
int currState = START;        // current state

// Single-character tokens; index 0 is a placeholder, ssym[] below holds the
// category code for the same index.
char csym[NSYM + 1] = {
	' ', '+', '-', '*', '/', '(', ')', '=', ',', '.', ';'
};

// Reserved words; index 0 is a placeholder, wsym[] below holds the category
// code for the same index.
char* word[NRW + 1] = {
	"", /* place holder */
	"begin", "call", "const", "do", "end","if",
	"odd", "procedure", "then", "var", "while"
};

// Token category codes.
enum symtype {
	SYM_NULL,
	SYM_IDENTIFIER,
	SYM_NUMBER,
	SYM_PLUS,
	SYM_MINUS,
	SYM_TIMES,
	SYM_SLASH,
	SYM_ODD,
	SYM_EQU,
	SYM_NEQ,
	SYM_LES,
	SYM_LEQ,
	SYM_GTR,
	SYM_GEQ,
	SYM_LPAREN,
	SYM_RPAREN,
	SYM_COMMA,
	SYM_SEMICOLON,
	SYM_PERIOD,
	SYM_BECOMES,
	SYM_BEGIN,
	SYM_END,
	SYM_IF,
	SYM_THEN,
	SYM_WHILE,
	SYM_DO,
	SYM_CALL,
	SYM_CONST,
	SYM_VAR,
	SYM_PROCEDURE
};

// Category code of each reserved word in word[].
int wsym[NRW + 1] = {
	SYM_NULL, SYM_BEGIN, SYM_CALL, SYM_CONST, SYM_DO, SYM_END,
	SYM_IF, SYM_ODD, SYM_PROCEDURE, SYM_THEN, SYM_VAR, SYM_WHILE
};

// Category code of each single-character token in csym[].
int ssym[NSYM + 1] = {
	SYM_NULL, SYM_PLUS, SYM_MINUS, SYM_TIMES, SYM_SLASH,
	SYM_LPAREN, SYM_RPAREN, SYM_EQU, SYM_COMMA, SYM_PERIOD, SYM_SEMICOLON
};

// Error messages, indexed by error code (entries 0 and 26 were added for the
// lexer).
char* err_msg[] =
{
/*  0 */    "Fatal Error:Unknown character.\n",
/*  1 */    "Found ':=' when expecting '='.",
/*  2 */    "There must be a number to follow '='.",
/*  3 */    "There must be an '=' to follow the identifier.",
/*  4 */    "There must be an identifier to follow 'const', 'var', or 'procedure'.",
/*  5 */    "Missing ',' or ';'.",
/*  6 */    "Incorrect procedure name.",
/*  7 */    "Statement expected.",
/*  8 */    "Follow the statement is an incorrect symbol.",
/*  9 */    "'.' expected.",
/* 10 */    "';' expected.",
/* 11 */    "Undeclared identifier.",
/* 12 */    "Illegal assignment.",
/* 13 */    "':=' expected.",
/* 14 */    "There must be an identifier to follow the 'call'.",
/* 15 */    "A constant or variable can not be called.",
/* 16 */    "'then' expected.",
/* 17 */    "';' or 'end' expected.",
/* 18 */    "'do' expected.",
/* 19 */    "Incorrect symbol.",
/* 20 */    "Relative operators expected.",
/* 21 */    "Procedure identifier can not be in an expression.",
/* 22 */    "Missing ')'.",
/* 23 */    "The symbol can not be followed by a factor.",
/* 24 */    "The symbol can not be as the beginning of an expression.",
/* 25 */    "The number is too great.",
/* 26 */    "The identifier is too long",
/* 27 */    "",
/* 28 */    "",
/* 29 */    "",
/* 30 */    "",
/* 31 */    "",
/* 32 */    "There are too many levels."
};

source.txt

const a=10;    {常量声明}
const b=20;
var c;         {变量声明}
procedure p;   {过程声明}
     begin
          c:=b+a
     end;

begin
     call p
end.

感想

断断续续折腾了两周，参考了好多文章，走了山路十八弯😡
看到过一篇把转换表做成二维数组的，字符数*状态数，太多了，就没用，本质和switch是一样的！
DFA太好用了！！！😭 值得！！！
现在也许还有bug，欢迎交流指正~

my3.c 完善了一下（视频P3）

#include "my3.h"

// Report error code n: echo the current source line, mark the current column
// with a caret on the next line, then print the message from err_msg[].
// Every call increments the global error counter err.
//
// getch() fills and reads line[] starting at index 1, so the line text is
// line[1..ll] and the current character is line[cc].
void error(int n)
{
    const int count = (int)(sizeof err_msg / sizeof err_msg[0]);

    err++;

    for (int i = 1; i <= ll; i++)
    {
        putchar(line[i]);
    }
    putchar('\n');

    // Pad up to the current column; repeat tabs from the source line so the
    // caret stays aligned on tab-indented lines.
    for (int i = 1; i <= cc - 1; i++)
    {
        putchar(line[i] == '\t' ? '\t' : ' ');
    }
    printf("^\n");

    if (n < 0 || n >= count)
    {
        printf("Error %3d: %s\n", n, "Unknown error code.");
        return;
    }
    printf("Error %3d: %s\n", n, err_msg[n]);
}


// Deliver the next source character in the global ch.
//
// Input is buffered one line at a time in line[], which is used 1-based:
// ll is the index of the last stored character and cc the index of the
// character most recently handed out. When the buffer is exhausted a new line
// is read from fp; the newline itself is not stored, a single blank is
// appended in its place so that tokens on adjacent lines stay separated.
//
// When getc() reports EOF, that value is stored (narrowed to char) as the last
// character of the final line; getsym() compares ch with EOF to detect the end
// of the input. Calling getch() again after that point prints
// "PROGRAM INCOMPLETE" and exits with status 1.
//
// A source line that does not fit into line[] terminates the program with
// status 1 instead of overflowing the buffer.
void getch()
{
    if (cc == ll) // buffer exhausted: refill it with the next line
    {
        ll = cc = 0;
        if (feof(fp) || ferror(fp))
        {
            printf("\nPROGRAM INCOMPLETE\n");
            exit(1);
        }

        for (;;)
        {
            int c = getc(fp); // int, so EOF is recognised reliably

            if (c == '\n')
                break;

            // Keep one slot free for the blank appended after the loop.
            if (ll + 2 >= (int)sizeof line)
            {
                printf("\nLINE TOO LONG\n");
                exit(1);
            }
            line[++ll] = (char)c;

            if (c == EOF) // end of file or read error: the line ends here
                break;
        }
        lc++;
        line[++ll] = ' '; // stands in for the line break
    }
    ch = line[++cc];
}

// 词法分析，读取一个单词
void getsym()
{
    while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '{')
    {
        if (ch == '{')
        { //忽略注释
            int end = 1;
            while (end)
            {
                getch();
                if (ch == '}')
                    end = 0;
            }
        }
        getch();
    }
    if (ch == EOF)
    {
        exit(0);
    }

    if (isalpha(ch)) // 当前输入为字母,则应该为关键字或标识符
    {
        char a[MAXIDLEN + 1]; // 当前读取到的单词
        int k = 0;
        for (; (isalpha(ch) || isdigit(ch)) && k < MAXIDLEN; k++)
        {
            a[k] = ch;
            getch();
        }

        a[k] = '\0';   // 字符数组和字符串的区别就是结尾少了\0，一定要加上！
        strcpy(id, a); // 保存到全局变量id中，语法分析要用

        // 检查是否为关键字
        int i = 1;
        for (; i <= NRW; i++)
        {
            if (strcmp(a, word[i]) == 0)
                break;
        }
        if (i <= NRW)
        {
            sym = wsym[i]; // symbol is a reserved word
        }
        else
        {
            sym = SYM_IDENTIFIER; // symbol is an identifier
        }
    }
    else if (isdigit(ch))
    { // symbol is a number.
        sym = SYM_NUMBER;
        int k = num = 0;
        while (isdigit(ch))
        {
            num = num * 10 + ch - '0';
            getch();
            k++;
        }
        if (k > MAXNUMLEN)
            error(25); // The number is too great.
    }
    else if (ch == ':')
    {
        getch();
        if (ch == '=')
        {
            sym = SYM_BECOMES; // :=
            getch();
        }
        else
        {
            sym = SYM_NULL; // illegal?
        }
    }
    else if (ch == '>')
    {
        getch();
        if (ch == '=')
        {
            sym = SYM_GEQ; // >=
            getch();
        }
        else
        {
            sym = SYM_GTR; // >
        }
    }
    else if (ch == '<')
    {
        getch();
        if (ch == '=')
        {
            sym = SYM_LEQ; // <=
            getch();
        }
        else if (ch == '>')
        {
            sym = SYM_NEQ; // <>
            getch();
        }
        else
        {
            sym = SYM_LES; // <
        }
    }
    else
    { // other tokens : '+', '-', '*', '/', '(', ')', '=', ',', '.', ';'
        //代码和识别关键字那里类似
        int i = 1;
        for (; i <= NSYM; i++)
        {
            if (ch == csym[i])
                break;
        }
        if (i <= NSYM)
        {
            sym = ssym[i];
            getch();
        }
        //不应该出现的字符
        else
        {
            error(0);
            sym = SYM_NULL;
        }
    }
}

// Smoke test for getch(): echo "source_ANSI.txt" character by character.
//
// The loop has no exit of its own; the program ends inside getch(), which
// calls exit(1) once the input is exhausted. Returns 1 when the file cannot
// be opened.
int main()
{
    fp = fopen("source_ANSI.txt", "r");
    if (!fp)
    {
        printf("文件不存在");
        return 1; // getch() must not be called with a NULL stream
    }

    for (;;)
    {
        getch();
        printf("%c", ch);
    }
}

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* ---- size limits ------------------------------------------------------- */
#define NRW        11    /* number of reserved words */
#define MAXNUMLEN  14    /* maximum number of digits in numbers */
#define NSYM       10    /* maximum number of symbols in array ssym and csym */
#define MAXIDLEN   10    /* length of identifiers */
#define TXMAX      500   /* length of identifier table */
#define MAXLEVEL   32    /* maximum depth of nesting block */
#define MAXINS     8     /* maximum number of instructions */
#define CXMAX      500   /* size of code array */
#define MAXADDRESS 32767 /* maximum address */
#define STACKSIZE  200   /* maximum storage */

/* ---- scanner state ----------------------------------------------------- */
char ch = ' '; /* last character read */
FILE *fp;      /* source file being compiled */

/* Description of the token read last; all three are assigned in getsym()
 * and are meant for building the symbol table. */
int sym;               /* last symbol read */
char id[MAXIDLEN + 1]; /* last identifier read */
int num;               /* last number read */

/* Bookkeeping used by getch(), so that an error can be reported together
 * with the line and column it occurred in. */
int cc = 0;    /* character count (characters of the line read so far) */
int ll = 0;    /* line length */
int lc = 1;    /* line count */
int err = 0;   /* error count */
char line[80]; /* input buffer */

/* ---- token category codes ---------------------------------------------- */
enum symtype
{
    SYM_NULL, SYM_IDENTIFIER, SYM_NUMBER,
    SYM_PLUS, SYM_MINUS, SYM_TIMES, SYM_SLASH,
    SYM_ODD,
    SYM_EQU, SYM_NEQ, SYM_LES, SYM_LEQ, SYM_GTR, SYM_GEQ,
    SYM_LPAREN, SYM_RPAREN, SYM_COMMA, SYM_SEMICOLON, SYM_PERIOD,
    SYM_BECOMES,
    SYM_BEGIN, SYM_END, SYM_IF, SYM_THEN, SYM_WHILE, SYM_DO,
    SYM_CALL, SYM_CONST, SYM_VAR, SYM_PROCEDURE
};

/* Reserved words; slot 0 is a placeholder. */
char *word[NRW + 1] = {
    "",
    "begin",
    "call",
    "const",
    "do",
    "end",
    "if",
    "odd",
    "procedure",
    "then",
    "var",
    "while"
};

/* Category code of the reserved word stored at the same index of word[]. */
int wsym[NRW + 1] = {
    SYM_NULL,
    SYM_BEGIN,
    SYM_CALL,
    SYM_CONST,
    SYM_DO,
    SYM_END,
    SYM_IF,
    SYM_ODD,
    SYM_PROCEDURE,
    SYM_THEN,
    SYM_VAR,
    SYM_WHILE
};

/* Operators and delimiters; slot 0 is a placeholder. */
char csym[NSYM + 1] = {
    ' ',
    '+', '-', '*', '/',
    '(', ')',
    '=', ',', '.', ';'
};

/* Category code of the character stored at the same index of csym[]. */
int ssym[NSYM + 1] = {
    SYM_NULL,
    SYM_PLUS, SYM_MINUS, SYM_TIMES, SYM_SLASH,
    SYM_LPAREN, SYM_RPAREN,
    SYM_EQU, SYM_COMMA, SYM_PERIOD, SYM_SEMICOLON
};

/* Kinds of identifiers, stored in the kind field of a symbol-table entry. */
enum idtype { ID_CONSTANT, ID_VARIABLE, ID_PROCEDURE };

/* ---- target code ------------------------------------------------------- */
/* Function codes of the target instructions. */
enum opcode { LIT, OPR, LOD, STO, CAL, INT, JMP, JPC };

/* Mnemonic of each function code. */
char *mnemonic[MAXINS] = {
    [LIT] = "LIT",
    [OPR] = "OPR",
    [LOD] = "LOD",
    [STO] = "STO",
    [CAL] = "CAL",
    [INT] = "INT",
    [JMP] = "JMP",
    [JPC] = "JPC"
};

/* Operation codes carried by an OPR instruction. */
enum oprcode
{
    OPR_RET, OPR_NEG,
    OPR_ADD, OPR_MIN, OPR_MUL, OPR_DIV,
    OPR_ODD,
    OPR_EQU, OPR_NEQ, OPR_LES, OPR_LEQ, OPR_GTR, OPR_GEQ
};

/* ---- symbol table ------------------------------------------------------ */
/* Entry layout used for constant declarations. */
typedef struct
{
    char name[MAXIDLEN + 1];
    int kind;
    int value;
} comtab;

/* Entry layout used for variable and procedure declarations. */
typedef struct
{
    char name[MAXIDLEN + 1];
    int kind;
    short level;   /* nesting level / level difference */
    short address; /* relative address of the storage location */
} mask;

comtab table[TXMAX]; /* symbol table: constants, variables and procedures */

/* ---- generated code ---------------------------------------------------- */
/* One target instruction. */
typedef struct
{
    int f; /* function code */
    int l; /* level difference */
    int a; /* displacement address */
} instruction;

instruction code[CXMAX]; /* generated target code */

/* Nesting level: procedures may be nested, the main program is level 0 and a
 * procedure declared in it is level 1 (the original note says "at most three
 * levels", while MAXLEVEL above is 32). */
int level = 0;
int cx = 0; /* index of current instruction to be generated. */
int tx = 0; /* index of table */
int dx = 0; /* data allocation index */

/* ---- error messages, indexed by error code ----------------------------- */
/* Entries 0 and 26 were added on top of the usual PL/0 message list. */
char *err_msg[] = {
    "Fatal Error:Unknown character.\n",                                      /*  0 */
    "Found ':=' when expecting '='.",                                        /*  1 */
    "There must be a number to follow '='.",                                 /*  2 */
    "There must be an '=' to follow the identifier.",                        /*  3 */
    "There must be an identifier to follow 'const', 'var', or 'procedure'.", /*  4 */
    "Missing ',' or ';'.",                                                   /*  5 */
    "Incorrect procedure name.",                                             /*  6 */
    "Statement expected.",                                                   /*  7 */
    "Follow the statement is an incorrect symbol.",                          /*  8 */
    "'.' expected.",                                                         /*  9 */
    "';' expected.",                                                         /* 10 */
    "Undeclared identifier.",                                                /* 11 */
    "Illegal assignment.",                                                   /* 12 */
    "':=' expected.",                                                        /* 13 */
    "There must be an identifier to follow the 'call'.",                     /* 14 */
    "A constant or variable can not be called.",                             /* 15 */
    "'then' expected.",                                                      /* 16 */
    "';' or 'end' expected.",                                                /* 17 */
    "'do' expected.",                                                        /* 18 */
    "Incorrect symbol.",                                                     /* 19 */
    "Relative operators expected.",                                          /* 20 */
    "Procedure identifier can not be in an expression.",                     /* 21 */
    "Missing ')'.",                                                          /* 22 */
    "The symbol can not be followed by a factor.",                           /* 23 */
    "The symbol can not be as the beginning of an expression.",              /* 24 */
    "The number is too great.",                                              /* 25 */
    "The identifier is too long",                                            /* 26 */
    "",                                                                      /* 27 */
    "",                                                                      /* 28 */
    "",                                                                      /* 29 */
    "",                                                                      /* 30 */
    "",                                                                      /* 31 */
    "There are too many levels."                                             /* 32 */
};

悄悄地把鞋带系好

关注

14
点赞
踩
36

收藏

觉得还不错? 一键收藏
8
评论
【编译原理】PL/0编译程序之词法分析 | DFA | C语言实现

核心：遍历字符流。识别单词的时候分为：1-标识符/关键字，2-整数，3-复合运算符，4-单独字符。单独字符直接识别，不进DFA了。注释和一些无效字符直接忽略。
复制链接

扫一扫