C语言实现简易词法分析器

最新推荐文章于 2024-07-02 22:22:32 发布

felicitia

最新推荐文章于 2024-07-02 22:22:32 发布

阅读量3.3w

点赞数 9

分类专栏：编译原理

本文链接：https://blog.csdn.net/felicitia/article/details/8109137

版权

编译原理专栏收录该内容

2 篇文章 0 订阅

订阅专栏

词法分析是编译的基础，需要对程序中的单词进行划分，并生成token文件（主要存符号表的入口地址，以便获取进一步需要的信息），供语法分析阶段使用。同时要生成符号表，包括变量的和常量的，在之后的分析中会不断的查填符号表，将单词的类型，值等各项信息填完整，才能进行运算等操作。采用了较为底层的C语言。结果由于好久没用过了，犯了许多低级的错误，编程过程比较艰辛，下面总结遇到的问题。

1、C语言严格区分字符和字符串，%c和%s不要混用，否则造成异常。Java中经常使用String或StringBuilder，淡化了字符的概念，在C中需要注意。

2、自定义的头文件要用双引号而不是尖括号。

3、'\n'光标移到下一行，在打印源程序时，头文件打印空行，看上去像多打印了一个'\n'，其实没错。

4、字符串不能直接用赋值运算符复制，需要搞清楚char型数组和char*的不同。strcpy是将内容进行拷贝，char型数组要用strcpy来赋值；而char*可以直接赋值为字符串常量：字符串常量表示的是一个地址，可以让char型的指针直接指向该地址，避免空间浪费。

5、在不同的处理函数中，要生成token，并赋值，然后让token的指针的数组指向该位置。但由于函数内的token变量是局部变量，跳出函数则失效，故指针指向的位置的内容将有可能改变，导致在该函数之外往文件中写token数组时乱码。

6、fgets读到行末尾时会保留换行符'\n'，并在其后添加'\0'，所以判断字符串结束时要检查的是'\0'，而不是'\n'。

7、开辟的数组空间太小导致越界，也会导致乱码。

8、最费脑子的错误就是自己设计的headCh，作为每次识别的第一个字符。因为读文件的时候，文件指针会一直的向后移动，当时为了避免文件指针往回移动的时间开销，就用一个char类型的headCh来保存第一个字符。但是预处理的函数中，需要根据下一个字符判断是否结束预处理，故由headCh保存下一个字符，再读到buffer中时，buffer的第一个字符为下一行的第二个字符，即headCh+buffer的内容才是完整的一行的内容。所以为了不改变程序的处理过程，还要保持headCh始终在buffer[start]之前的一位，而处理程序是根据headCh的值来判断调用哪个处理函数的，故每次处理结束后需要给headCh和start重新赋值。

程序流程图：

Token：

错误处理：

源码：

main.c

#include "global.h"
#include <ctype.h>
/**
 * Program entry: tokenises one C source file chosen interactively.
 *
 * Opens error.txt and out.txt, asks for the input file name, lets predeal()
 * consume the leading '#' lines, then handles the file one line at a time.
 * For every line, headCh holds the first non-blank character (already read
 * by goBlank()/predeal()) and buf holds the remainder of the line; headCh is
 * always the character just before buf[start].  The character class of
 * headCh selects the deal*() routine to run.  Finally the token, variable
 * and constant tables are written out.
 *
 * Returns 0 on success, -1 when a file cannot be opened or no file name
 * could be read.
 */
int main()
{
    char fname[FSIZE];
    char fmt[16];

    error = fopen("error.txt", "w+");
    if(error==NULL)
    {
        printf("cannot create error.txt!\n");
        return -1;
    }
    out = fopen("out.txt", "w+");
    if(out==NULL)
    {
        printf("cannot create out.txt!\n");
        fclose(error);
        return -1;
    }
    printf("please input filename: \n");
    /* Build "%<FSIZE-1>s" so scanf can never write past fname. */
    snprintf(fmt, sizeof fmt, "%%%ds", FSIZE - 1);
    if(scanf(fmt, fname) != 1)
    {
        printf("error: no filename given\n");
        fclose(out);
        fclose(error);
        return -1;
    }
    in = fopen(fname, "r");
    if(in==NULL)
    {
        printf("error: cannot open file %s\n", fname);
        fclose(out);
        fclose(error);
        return -1;
    }
    headCh = predeal(in);

    /* headCh is a real character unless the previous read ran into EOF or
       an I/O error, so test the stream state rather than fgets() alone:
       a final line made of a single character has nothing left for fgets()
       to read but must still be analysed. */
    while(!feof(in) && !ferror(in))
    {
        size_t len;

        if(fgets(buf, BSIZE, in) == NULL)
        {
            buf[0] = '\0';
        }
        head = headCh;
        /* Drop the line terminator only when it is really there; the last
           line of a file may end without '\n'. */
        len = strlen(buf);
        if(len > 0 && buf[len-1] == '\n')
        {
            buf[len-1] = '\0';
        }
        start = 0;
        while(headCh != '\0')
        {
            /* Skip blanks between tokens (tabs as well as spaces, matching
               what goBlank() skips at the start of a line). */
            if(headCh == ' ' || headCh == '\t')
            {
                while(buf[start] == ' ' || buf[start] == '\t')
                {
                    start++;
                }
                if(buf[start] == '\0')
                {
                    break;
                }
                headCh = buf[start++];
            }
            /* <ctype.h> functions require an unsigned char value. */
            if(isalpha((unsigned char)headCh))
            {
                dealAlpha();
            }
            else if(isdigit((unsigned char)headCh))
            {
                dealDigit();
            }
            else if(headCh=='/')
            {
                if(dealNotation()==-1)
                {
                    printf("notation too long to analyze, skip this line...\n");
                    fputc('\n', out);
                    break;
                }
            }
            else if(isBorder(headCh))
            {
                dealBorder();
            }
            else if(headCh=='\'' || headCh =='"')
            {
                dealChar(headCh);
            }
            else /* character that cannot start any token */
            {
                fprintf(error, "L%d\tcannot analyse %c\n",line,headCh);
                headCh = buf[start];
            }
            /* Every branch left headCh == buf[start]; step past it so that
               headCh is again the character just before buf[start]. */
            start++;
        }
        line++;
        flag = 0;
        headCh = goBlank(in);
        /* Lines that contained a comment were already replaced by an empty
           line in out.txt by dealNotation(). */
        if(isNotation==0)
        {
            fprintf(out, "%c%s\n", head, buf);
        }
        isNotation = 0;
    }


    puts("Everything has done...");
    writeToken();
    writeVarTable();
    writeConTable();
    fclose(in);
    fclose(out);
    fclose(error);
    printf("错误日志\t\t\terror.txt\n");
    printf("Token文件\t\t\ttoken.txt\n");
    printf("无注释头文件的源文件日志\tout.txt\n");
    printf("常量符号表\t\t\tconTable.txt\n");
    printf("变量符号表\t\t\tvarTable.txt\n");
    return 0;
}
/**预处理**/
char predeal(FILE *in)
{
    char ch;
    ch = goBlank(in);
    while(ch  == '#')
    {
        fgets(buf, BSIZE, in);
        dealInclude(buf);
        line++;
        fputc('\n', out);
        ch = goBlank(in);
    }
    printf("headers done...\n");
    return ch;
}

/** deal headers, like #include <...>**/
void dealInclude(char *buf)
{
    char include[15];
    char ch;
    int i=9;
    strncpy(include, buf, 9);
    include[9] = '\0';
    //printf("%s%d",include,strlen(include));
    if(strcmp(include, "include <")==0)
    {
        while((ch=buf[i])!='>')
        {
            i++;
            if(ch=='\n')
            {
                fprintf(error, "L%d\theaders end without '>'\n",line);
                break;
            }
        }
    }
    else
    {
        fprintf(error, "L%d\theaders format error\n",line);
    }

}

/**
 * Skips spaces, tabs and newlines on `in`.
 *
 * Each newline that is skipped bumps the global line counter and is echoed
 * to out as an empty line.  Returns the first character that is not a
 * space, tab or newline, converted to char (at end of input this is the
 * converted value of EOF).
 */
char goBlank(FILE* in)
{
    int c;

    for(;;)
    {
        c = fgetc(in);
        if(c == '\n')
        {
            line++;
            fputc('\n', out);
            continue;
        }
        if(c != ' ' && c != '\t')
        {
            break;
        }
    }
    return (char)c;
}

/**deal begin with alpha**/
void dealAlpha()
{
    int symbol;
    int id;
    char word[100];
    Token token;
    VarTable varTable;
    int i;
    word[0] = headCh;
    for(i=start; isdigit(buf[i])||isalpha(buf[i]); i++)
    {
        word[i-start+1] = buf[i];
    }
    word[i-start+1] = '\0';
    // forward = i;
    start = i;
    headCh = buf[start];


    symbol = isKeyword(word);
    /**not keyword**/
    if(symbol == -1)
    {
        /*  id = isInVarTable(word);
          if(id ==-1) //not in the varTable
          {
              varTable.id = varTableNum;
              strcpy(varTable.name, word);
              varTableArray[varTableNum] = varTable;
              varTableNum++;
              id = varTable.id;
          }*/

        varTable.id = varTableNum;
        strcpy(varTable.name, word);
        varTableArray[varTableNum] = varTable;
        varTableNum++;

        token.symbol = IDN;
        sprintf(token.attr, "%d", varTable.id);//change int to string
        strcpy(token.name,word);
        tokenArray[tokenNum] = token;
        tokenNum++;
    }
    /** is keyword**/
    else
    {
        token.symbol = symbol;
        strcpy(token.name,word);
        strcpy(token.attr,"--");
        tokenArray[tokenNum] = token;
        tokenNum++;
    }
}

/**
 * Looks `word` up in keywordList (which ends with an empty string).
 *
 * Returns 256 plus the list index when the word is a keyword, -1 otherwise.
 */
int isKeyword(char * word)
{
    int idx = 0;

    while(keywordList[idx][0] != '\0')
    {
        if(!strcmp(keywordList[idx], word))
        {
            return 256 + idx;
        }
        idx++;
    }
    return -1;
}

/**
 * Writes the token array to token.txt, one token per line in the form
 * `name<TAB><TAB>(symbol, attr)`.
 *
 * When the file cannot be created a message is printed and nothing is
 * written.
 */
void writeToken()
{
    FILE* ftoken;
    int i;

    ftoken = fopen("token.txt", "w+");
    if(ftoken==NULL)
    {
        printf("cannot create file token.txt!\n");
        return; /* never hand a NULL stream to fprintf/fclose */
    }

    for(i=0; i<tokenNum; i++)
    {
        fprintf(ftoken, "%s\t\t(%d, %s)\n", tokenArray[i].name,tokenArray[i].symbol,tokenArray[i].attr);
    }
    fclose(ftoken);
}

/**
 * Writes the variable symbol table to varTable.txt, one row per line in the
 * form `name<TAB><TAB>id`.
 *
 * When the file cannot be created a message is printed and nothing is
 * written.
 */
void writeVarTable()
{
    FILE* fvarTable;
    int i;

    fvarTable = fopen("varTable.txt", "w+");
    if(fvarTable==NULL)
    {
        printf("cannot create file varTable.txt!\n");
        return; /* never hand a NULL stream to fprintf/fclose */
    }

    for(i=0; i<varTableNum; i++)
    {
        fprintf(fvarTable, "%s\t\t%d\n", varTableArray[i].name, varTableArray[i].id);
    }
    fclose(fvarTable);
}

/**
 * Writes the constant symbol table to conTable.txt, one row per line in the
 * form `name<TAB><TAB>id`.
 *
 * When the file cannot be created a message is printed and nothing is
 * written.
 */
void writeConTable()
{
    FILE* fconTable;
    int i;

    fconTable = fopen("conTable.txt", "w+");
    if(fconTable==NULL)
    {
        printf("cannot create file conTable.txt!\n");
        return; /* never hand a NULL stream to fprintf/fclose */
    }

    for(i=0; i<conTableNum; i++)
    {
        fprintf(fconTable, "%s\t\t%d\n", conTableArray[i].name, conTableArray[i].id);
    }
    fclose(fconTable);
}

/**deal with digit**/
void dealDigit()
{
    int symbol;
    int id;
    char word[100];
    Token token;
    ConTable conTable;
    int i;
    word[0] = headCh;
    for(i=start; isdigit(buf[i]); i++)
    {
        word[i-start+1] = buf[i];
    }
    if(buf[i]=='.')
    {
        if(!isdigit(buf[++i]))
        {
            start = i;
            headCh = buf[start];
            fprintf(error, "L%d\tunavailabe float\n",line);
            return;
        }
        word[i-start] = '.';
        for(; isdigit(buf[i]); i++)
        {
            word[i-start+1] = buf[i];
        }
        word[i-start+1] = '\0';
        // forward = i;
        start = i;
        headCh = buf[start];

        id = isInConTable(word);
        /**不在常量表里，新加项**/
        if(id==-1)
        {
            conTable.id = conTableNum;
            strcpy(conTable.name, word);
            conTableArray[conTableNum] = conTable;
            conTableNum++;
            id = conTable.id;
        }

        token.symbol = FNUM;
        sprintf(token.attr, "%d", id);//change int to string
        strcpy(token.name,word);
        tokenArray[tokenNum] = token;
        tokenNum++;
    }
    else
    {
        word[i-start+1] = '\0';
        // forward = i;
        start = i;
        headCh = buf[start];

        id = isInConTable(word);
        /**不在常量表里，新加项**/
        if(id==-1)
        {
            conTable.id = conTableNum;
            strcpy(conTable.name, word);
            conTableArray[conTableNum] = conTable;
            conTableNum++;
            id = conTable.id;
        }

        token.symbol = INUM;
        sprintf(token.attr, "%d", id);//change int to string
        strcpy(token.name,word);
        tokenArray[tokenNum] = token;
        tokenNum++;
    }

}

/**处理界符**/
void dealBorder()
{
    Token token;
    char s[3];
    int i;
    s[0] = headCh;
    s[1] = buf[start];
    s[2] = '\0';
    if(s[1] != '\0')
    {
        //deal two border
        for(i = 0 ; borderList[i][0] ; i++)
        {
            if(strcmp(s, borderList[i]) == 0)
            {
                strcpy(token.name, s);
                token.symbol = i+400;
                strcpy(token.attr,"--");
                tokenArray[tokenNum++] = token;
                start++;
                headCh = buf[start];
                return;
            }
        }
    }
    /**处理单界符**/
    s[1] = '\0';
    for(i = 0 ; borderList[i][0] ; i++)
    {
        if(strcmp(s, borderList[i]) == 0)
        {

            strcpy(token.name, s);
            token.symbol = i+400;
            strcpy(token.attr,"--");
            tokenArray[tokenNum++] = token;

            //  start++;
            headCh = buf[start];
            return;
        }
    }
}

/**
 * Tells whether ch can start an operator or delimiter, i.e. whether it is
 * the first character of some borderList entry.
 *
 * Returns 1 if so, 0 otherwise.  The search is bounded by the array size,
 * so it does not depend on the list ending with an empty string.
 */
int isBorder(char ch)
{
    const size_t count = sizeof borderList / sizeof borderList[0];
    size_t i;

    for(i = 0; i < count && borderList[i][0] != '\0'; i++)
    {
        if(ch == borderList[i][0])
        {
            return 1;
        }
    }
    return 0;
}

/**处理注释**/
int dealNotation()
{
    char ch = buf[start];
    int i;
    int notationLen=0;
    Token token;
    //printf("dealNotation: ch = %c\n", ch);
    /**除号处理**/
    if(ch!='/'&&ch!='*')
    {
        strcpy(token.name,"/");
        strcpy(token.attr,"--");
        token.symbol = DIV;
        tokenArray[tokenNum++] = token;
        start++;
        headCh = buf[start];
        return 0;
    }
    if(ch=='/')
    {
        fputc('\n',out);
        isNotation = 1;
        headCh='\0';
        return 0;
    }
    else if(ch=='*')
    {
        for(i=start+1;; i++)
        {
            if(buf[i]=='\0')
            {
                fgets(buf, BSIZE, in);
                line++;
                fputc('\n', out);
                start = 0;
                headCh = '\0';
                i = -1;
            }
            else
            {
                while(!(buf[i]=='*'&&buf[i+1]=='/'))
                {
                    i++;
                    if(++notationLen == LIMIT_NOTATION)
                    {
                        fprintf(error, "L%d\tnotation too long\n", line);
                        isNotation = 1;
                        return -1;
                    }
                 //   printf("notationLen = %d\n", notationLen);
                    if(buf[i+1]=='\0')
                    {
                        fgets(buf, BSIZE, in);
                        line++;
                        fputc('\n', out);
                        i = 0;
                    }
                }
                fputc('\n', out);
                isNotation = 1;
                start = i + 2;
                headCh = buf[start];
                return 0;
            }
        }

    }
}

/**处理字符常量**/
void dealChar(char ch)
{
    Token token;
    ConTable conTable;
    int i = start;
    int id;
    int j;
    char word[100];
    word[0] = ch;
    if(ch == '\'')//const char
    {
        if(buf[i]=='\\')//change char
        {
            for(j=0; j<12; j++)
            {
                if(buf[i+1]==changeList[j])
                {
                    word[1] = '\\';
                    word[2] = buf[i+1];
                    word[3] = '\'';
                    word[4] = '\0';
                    strcpy(token.name, word);
                    strcpy(token.attr, "--");
                    token.symbol = j+500;

                    tokenArray[tokenNum++] = token;
                    start = i+3;
                    headCh = buf[start];
                    return;
                }
            }
            /**error: 转义字符不合法**/
            if(j==12)
            {
                fprintf(error, "L%d\tunavailable change char\n", line);
            }
        }
        else if(buf[i+1] != '\'')
        {
            fprintf(error, "L%d\tthe length of const char is unavailabe\n", line);
            for(i = i+2; buf[i]!='\''; i++);
            start = i +1;
            headCh = buf[start];
            return;
        }
        else
        {
            word[1] = buf[i];
            word[2] = '\'';
            word[3] = '\0';
            id = isInConTable(word);
            /**不在常量表里，新加项**/
            if(id==-1)
            {
                conTable.id = conTableNum;
                strcpy(conTable.name, word);
                conTableArray[conTableNum++] = conTable;
                id = conTable.id;
            }

            token.symbol = CCHAR;
            sprintf(token.attr, "%d", id);//change int to string
            strcpy(token.name,word);
            tokenArray[tokenNum++] = token;
            start = i+2;
            headCh = buf[start];
            return;
        }
    }
    else if(ch == '"')//字符串常量
    {
        for(; buf[i]!='"'; i++)
        {
            word[i-start+1] = buf[i];
          //  printf("buf%d:%s\n", i, buf);

        }
        word[i - start + 1] = '"';
        word[i - start + 2] = '\0';

        id = isInConTable(word);
        /**不在常量表里**/
        if(id==-1)
        {
            conTable.id = conTableNum;
            strcpy(conTable.name, word);
            conTableArray[conTableNum++] = conTable;
            id = conTable.id;
        }

        token.symbol = CSTR;
        sprintf(token.attr, "%d", id);//change int to string
        strcpy(token.name,word);
        tokenArray[tokenNum++] = token;
        start = i + 1;
        headCh = buf[start];
    }
}

/**
 * Searches the variable symbol table for `name`.
 *
 * Returns the id stored in the first matching row, or -1 when no row has
 * that name.
 */
int isInVarTable(char *name)
{
    int row = 0;

    while(row < varTableNum)
    {
        if(!strcmp(varTableArray[row].name, name))
        {
            return varTableArray[row].id;
        }
        row++;
    }
    return -1;
}

/**
 * Searches the constant symbol table for `name`.
 *
 * Returns the id stored in the first matching row, or -1 when no row has
 * that name.
 */
int isInConTable(char *name)
{
    int row = 0;

    while(row < conTableNum)
    {
        if(!strcmp(conTableArray[row].name, name))
        {
            return conTableArray[row].id;
        }
        row++;
    }
    return -1;
}

global.h

#ifndef GLOBAL_H_INCLUDED
#define GLOBAL_H_INCLUDED

#include <stdio.h>
#include <string.h>

/* Capacity limits used throughout the lexer. */
#define BSIZE   1024            /* size of the line buffer `buf` (fgets limit) */
#define FSIZE   50              /* size of the input file name buffer in main() */
#define TSIZE   1024            /* number of slots in tokenArray */
#define VTSIZE 1024             /* number of slots in varTableArray */
#define CTSIZE  1024            /* number of slots in conTableArray */
#define LIMIT_NOTATION 10       /* dealNotation() gives up on a block comment after scanning this many characters */

/**
 * Keyword token codes.
 *
 * isKeyword() returns 256 plus the index of the word in keywordList, and
 * keywordList lists the words in the same order as these macros, so each
 * macro equals the code produced for the keyword it is named after.
 **/
#define INCLUDE         256
#define AUTO                 257
#define BREAK                258
#define CASE                259
#define CHAR                260
#define CONST               261
#define CONTINUE        262
#define DEFAULT             263
#define DO                      264
#define DOUBLE                  265
#define ELSE                    266
#define ENUM                267
#define EXTERN                      268
#define FLOAT               269
#define FOR                     270
#define GOTO                271
#define IF                      272
#define  INT                 273
#define  LONG                 274
#define REGISTER            275
#define RETURN             276
#define SHORT              277
#define SIGNED             278
#define SIZEOF           279
#define STATIC         280
#define STRUCT          281
#define SWITCH       282
#define TYPEDEF     283
#define UNION           284
#define UNSIGNED    285
#define VOLATILE     286
#define WHILE          287

/**
 * Token codes for identifiers and constants.
 *
 * IDN is assigned by dealAlpha() to non-keyword words, INUM and FNUM by
 * dealDigit() to integer and floating literals, CCHAR and CSTR by
 * dealChar() to character and string constants.
 **/
#define IDN                 300
#define INUM               301
#define FNUM                302
#define CCHAR               303
#define CSTR                 304

/**
 * Token codes for operators and delimiters ("borders").
 *
 * dealBorder() does not use these macros: it assigns 400 plus the index of
 * the lexeme in borderList.
 *
 * NOTE(review): borderList holds 40 lexemes while these macros cover
 * 400..440 (41 values).  Up to QUE (417, "?") names and list positions
 * agree, but borderList[18] is "!" whereas 418 is named SIG and NOT is 419,
 * so from SIG onward the names look shifted by one relative to the codes
 * actually emitted.  Confirm which lexeme SIG was meant for before relying
 * on these names.
 **/
#define PLUS                  400
#define MINUS                   401
#define MUL                     402
#define DIV                   403
#define REM                     404
#define SEMI                   405
#define  COM                    406
#define  BLP                    407
#define  BRP                        408
#define  SRP 409
#define  SLP 410
#define  BIG 411
#define  SML 412
#define  EQU 413
#define  MLP 414
#define  MRP 415
#define  COL 416
#define  QUE 417
#define  SIG 418
#define  NOT 419
#define  AND 420
#define  OR  421
#define  PP 422
#define  MM 423
#define  EQEQ 424
#define  NOTL 425
#define  NOTR 426
#define  DECL 427
#define  BIGE 428
#define  SMLE 429
#define  NOTE 430
#define  AA 431
#define  OO 432
#define  ANDE 433
#define  MINUE 434
#define  MULE 435
#define  DIVE 436
#define  XOR 437
#define  RIGHT 438
#define  LEFT 439
#define  TURN 440

/**
 * Token codes for escape sequences inside character constants.
 *
 * dealChar() assigns 500 plus the index of the escaped character in
 * changeList; the macros follow the same order as changeList
 * (a b f n r t v \ ? " ' 0).
 **/
#define CA 500
#define CB 501
#define CF 502
#define CN 503
#define CR 504
#define CT 505
#define CV 506
#define CBSL 507
#define CQUE 508
#define CDQM 509
#define  CQM 510
#define ZERO 511

/**structs**/
/* One row of the variable symbol table (filled in by dealAlpha()). */
typedef struct varTable
{
    int id;             /* row number, equal to the row's index in varTableArray */
    char name[100];     /* identifier text, NUL-terminated */
} VarTable;

/* One row of the constant symbol table (filled in by dealDigit() and dealChar()). */
typedef struct conTable
{
    int id;             /* row number, equal to the row's index in conTableArray */
    char name[100];     /* literal text as written in the source, NUL-terminated */
} ConTable;

typedef struct token
{
    char name[100];
    int symbol;
    char attr[100];
} Token;

/**
 * Global state of the lexer.
 *
 * NOTE(review): these are definitions, not extern declarations, so this
 * header can only be included from a single .c file.
 **/
FILE *error;    /* error log; main() opens it as error.txt */
FILE *out;      /* echo of the source without headers and comments; main() opens it as out.txt */
FILE *in;       /* the source file being analysed */

int line=1;             /* current source line number, used in error messages */
int tokenNum=0;         /* number of entries used in tokenArray */
int varTableNum=0;      /* number of rows used in varTableArray */
int conTableNum=0;      /* number of rows used in conTableArray */
int start = 0;          /* index into buf of the next unread character */
int forward = 0;        /* not referenced by the code in main.c (only in commented-out lines) */
int flag =0;            /* reset to 0 by main() after each line; not read anywhere in main.c */
int isNotation=0;       /* set to 1 by dealNotation(); main() then skips echoing the line and resets it */
char buf[BSIZE];        /* current line, without the character held in headCh */
Token tokenArray[TSIZE];
VarTable varTableArray[VTSIZE];
ConTable conTableArray[CTSIZE];
char headCh;    /* character currently being classified; always the one just before buf[start] */
char head;      /* copy of headCh taken at the start of a line, used when echoing the line to out */
/*
 * Operator and delimiter lexemes.  The token code of an entry is 400 plus
 * its index, so the order must not change.  The list ends with an empty
 * string because the loops that walk it stop at the first entry whose first
 * character is '\0'.
 */
char *borderList[] = { "+","-","*","/","%",";",",","{","}",")","(",">","<","=","[","]",":","?","!","&","|",
                       "++","--","==","/*","*/",":=",">=","<=","!=","&&","||","+=","-=","*=","/=","^",">>","<<","~",
                       ""
                     };
/*
 * Keywords.  The token code of an entry is 256 plus its index, so the order
 * must not change.  Terminated by an empty string.
 */
char *keywordList[] = {"include","auto","break","case","char","const","continue","default","do","double",
                       "else","enum","extern","float","for","goto","if","int","long","register",
                       "return","short","signed","sizeof","static","struct","switch","typedef",
                       "union","unsigned","volatile","while",""
                      };
/* Characters allowed after a backslash in a character constant; the token
   code of an escape is 500 plus the index of its character here. */
char changeList[12] = {'a', 'b', 'f','n','r','t','v','\\','?','"','\'','0'};

/**Functions**/
/* Skips spaces, tabs and newlines on `in`; returns the first other character. */
char goBlank(FILE* in);
/* Consumes the leading '#' lines; returns the first character of the code that follows. */
char predeal(FILE* in);
/* Checks the text after '#' of one header line and logs format errors. */
void dealInclude(char* in);
/* Scans a keyword or identifier starting at headCh. */
void dealAlpha();
/* Scans an integer or floating literal starting at headCh. */
void dealDigit();
/* Scans a one- or two-character operator/delimiter starting at headCh. */
void dealBorder();
/* Handles '/', line comments and block comments; returns -1 when it gives up on a block comment, 0 otherwise. */
int dealNotation();
/* Scans a character or string constant; ch is the opening quote. */
void dealChar(char ch);
/* Returns 256 + index in keywordList, or -1 when word is not a keyword. */
int isKeyword(char *word);
/* Write tokenArray / varTableArray / conTableArray to token.txt / varTable.txt / conTable.txt. */
void writeToken();
void writeVarTable();
void writeConTable();
/* Returns 1 when ch is the first character of some borderList entry, else 0. */
int isBorder(char ch);
/* Return the id of the row named `name`, or -1 when there is none. */
int isInVarTable(char *name);
int isInConTable(char *name);



#endif // GLOBAL_H_INCLUDED

程序说明：

1、不考虑自定义头文件，#include < 规定这样开头，尖括号前面只能空一格。
2、标识符长度< 100,否则越界。
3、字符常量的长度不可超过1；字符常量和字符串常量都必须有结束的引号。
4、注释会整行打印空行。

5、错误文件输出到error.txt；常量符号表：conTable.txt；变量符号表：varTable.txt；无头文件和注释的程序out.txt；token输出到token.txt

6、错误处理包括：不合法的浮点数，注释不封闭，转义字符不存在，头文件包含错误，程序中有非法字符。

7、为了显示注释不封闭的错误处理，定义超过10个字符的“/*”没有结尾就算不封闭。可以在global.h中进行更改，缓冲区大小，文件名长度，token, conTable, varTable的数组长度都可定义。由于struct中不是指针，比较占用空间，故不可分析太大的程序，容易数组越界。