编译原理实验（一）—— 源程序的预处理及词法分析程序的设计与实现（C语言实现）-CSDN博客

本文链接：https://blog.csdn.net/weixin_44792920/article/details/122450354

本文介绍了一个包含预处理功能的词法分析程序设计与实现。通过C语言编程，实现了对源程序进行预处理和词法分析的功能，能够识别关键字、标识符、数字、字符串等多种词法元素。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

大家好，下面介绍的是我当时上编译原理所做的实验，主要内容就是先对源程序进行预处理，然后再对处理过的程序进行词法分析。用的是最基本的C语言写的，如有不足，欢迎大家批评指正！

一、实验目的

设计并实现一个包含预处理功能的词法分析程序，加深对编译中词法分析过程的理解。

二、实验要求

1、实现预处理功能

源程序中可能包含有对程序执行无意义的符号，要求将其剔除。
首先编制一个源程序的输入过程，从键盘、文件或文本框输入若干行语句，依次存入输入缓冲区（字符型数据）；然后编制一个预处理子程序，去掉输入串中的回车符、换行符和跳格符等编辑性文字；把多个空白符合并为一个；去掉注释。
2、实现词法分析功能

输入：所给文法的源程序字符串。
输出：二元组（syn,token或sum）构成的序列。其中，
syn为单词种别码。
Token为存放的单词自身字符串。
Sum为整型常量。
具体实现时，可以将单词的二元组用结构进行处理。

三、实验设计

1）首先编写一个预处理子程序，用于读取文件并且去除文件中的回车换行、将多个空格合并成一个空格
2）然后再编写一个分析子程序，用于分析经过处理后的程序，并且识别枚举类型和共用体类型，识别struct、enum，识别&&和||、++和--、==，识别!=、-=、+=，识别浮点数、识别指数、识别指针变量、识别字符串、去掉多行注释、识别错误信息（数字后面跟有字母，如123fds；这里有个bug：识别不了16进制的数字，比如123efh，它其实是一个16进制数，但是也会被当作错误信息），对处理程序的大小没有限制
3)显示处理过的程序并保存在相应的文件中

四、代码实现及运行效果

运行效果
在这里插入图片描述
代码由三个文件构成，如下图所示

其中，Hong.h中主要是对一些常用的变量进行宏定义，fun.h中是对各种方法的实现，main.c则是对fun.h中实现的方法的简单的调用，一些需要注意的地方都在代码中以注释的形式展现，话不多说，上代码！

Hong.h

#include <stdio.h>
#define SizeRes 60 // number of entries in the ReservedWord table
#define Sizestr 20 // capacity of strToken, including the terminating '\0'
char ch;                                          // most recently read source character
char strToken[Sizestr];                           // text of the token currently being assembled
// Every prototype spells out its parameter list: an empty "()" leaves the
// arguments unchecked in C11 and means "no parameters" in C23, which would
// conflict with the two-argument GetBC definition.
int GetBC(char str[], int i);                     // index of the first non-space character of str at or after i
int IsLetter(void);                               // nonzero when ch is an ASCII letter
int IsDigit(void);                                // nonzero when ch is a decimal digit
char const *FindRes(char str[]);                  // type code of a reserved word/operator, or "ID" when str is not in the table
int IsIdentifier(void);                           // nonzero when ch may appear inside an identifier
int IsFloat(void);                                // nonzero when ch may appear inside a numeric literal
char *IsIntorFloatorExponent(char string[]);      // classifies a numeric literal as "int", "float" or "Exponent"
int IsError(void);                                // nonzero when ch is a letter or digit (used after a malformed number)
FILE *InserRes(char const *type, char const *value, FILE *p); // writes one "type,<TAB>value" line to p and returns p
void Yu(char F_open[], char F_out[]);             // preprocessor: copies F_open to F_out without line breaks, tabs, comments or repeated spaces

// Table of reserved words and operators. Each entry pairs the token text
// (name) with the type code (type) that FindRes reports for it.
struct s1
{
    char const *type;
    char const *name;
} ReservedWord[SizeRes] = {
    {"1", "main"}, {"2", "if"}, {"3", "then"}, {"4", "while"}, {"5", "do"}, 
    {"6", "static"}, {"7", "int"}, {"8", "double"}, {"9", "struct"}, {"10", "break"}, 
    // entry 13 used to be misspelled "swith", so "switch" was reported as an identifier
    {"11", "else"}, {"12", "long"}, {"13", "switch"}, {"14", "case"}, {"15", "typedef"}, 
    {"16", "char"}, {"17", "return"}, {"18", "const"}, {"19", "float"}, {"20", "short"}, 
    {"21", "continue"}, {"22", "for"}, {"23", "void"}, {"24", "default"}, {"25", "sizeof"}, 
    {"26", "enum"}, {"27", "union"}, // 1. enum and union keywords
    {"28", "+"},{"29", "-"},{"30", "*"},{"31", "/"},{"32", ":"},{"33", ":="},{"34", "<"},
    {"35", "<>"},{"36", "<="},{"37", ">"},{"38", ">="},{"39", "="},{"40", ";"},{"41", "("},
    {"42", ")"},{"43", "||"},{"44", "&&"},{"45", "{"},{"46", "}"}, // 2. logical or / logical and
    {"47", "++"},{"48", "--"},{"49", ","},{"50", "=="},{"51", "["},{"52", "]"},{"53", "#"},
    {"54", "include"},{"55", "define"},{"56", "&"},{"57", "."},{"58", "+="},{"59", "-="},
    {"60", "!="} // 3. ++, -- and ==
};

fun.h

#include <string.h>
#include "Hong.h"
// Looks str up in ReservedWord and returns the type code of the matching
// entry; any string that is not in the table is reported as "ID".
char const *FindRes(char str[])
{
  int slot = 0;
  while (slot < SizeRes && strcmp(str, ReservedWord[slot].name) != 0)
  {
    slot++;
  }
  if (slot == SizeRes)
    return "ID";
  return ReservedWord[slot].type;
}

// Returns 1 when the global ch is an ASCII letter (a-z or A-Z), otherwise 0.
int IsLetter()
{
  int lower = ('a' <= ch && ch <= 'z');
  int upper = ('A' <= ch && ch <= 'Z');
  return lower || upper;
}

// Returns 1 when the global ch may appear inside an identifier
// (a letter, a digit or an underscore), otherwise 0.
int IsIdentifier()
{
  return IsLetter() || IsDigit() || ch == '_';
}

// Returns 1 when the global ch may appear inside a numeric literal:
// a digit, the decimal point, the exponent marker e/E, or a minus sign.
int IsFloat()
{
  switch (ch)
  {
  case '.':
  case 'e':
  case 'E':
  case '-':
    return 1;
  default:
    return IsDigit() ? 1 : 0;
  }
}
// Classifies a numeric literal by the first marker found in it:
// a '.' gives "float", an 'e' or 'E' gives "Exponent", and a string
// containing neither is an "int".
char *IsIntorFloatorExponent(char string[])
{
  const char *cursor = string;
  while (*cursor != '\0')
  {
    switch (*cursor)
    {
    case '.':
      return "float";
    case 'e':
    case 'E':
      return "Exponent";
    default:
      cursor++;
      break;
    }
  }
  return "int";
}

// Returns 1 when the global ch is a decimal digit, otherwise 0.
int IsDigit()
{
  return ('0' <= ch && ch <= '9') ? 1 : 0;
}

// Returns 1 when the global ch is a letter or a digit, otherwise 0.
// Process uses it to swallow the remainder of a malformed number such as 123abc.
int IsError()
{
  return IsLetter() || IsDigit();
}

// Skips spaces in str starting at index i and returns the index of the
// first character that is not a space (possibly the terminating '\0').
int GetBC(char str[], int i)
{
  for (; str[i] == ' '; i++)
  {
    // a space can never be the terminator, so no separate '\0' test is needed
  }
  return i;
}

// Appends one result line of the form "type,<TAB>value<NEWLINE>" to p
// and hands the same stream back to the caller.
FILE *InserRes(char const *type, char const *value, FILE *p)
{
  FILE *out = p;
  fputs(type, out);
  fputs(",\t", out);
  fputs(value, out);
  fputs("\n", out);
  return out;
}

//进行单词分割和区分
void Process(char F_open[], char Result_file[], char error_file[])
{
  FILE *fp;
  fp = fopen(F_open, "r");
  FILE *res_file;
  res_file = fopen(Result_file, "w");
  FILE *err_file;
  err_file = fopen(error_file, "w");
  int size = 20;
  char L1[size];
  char temp[size];
  int index = 0;
  int WritedFlag = 0;
  while (fgets(L1, size, fp) != NULL)
  {
    int i = 0;
    while (L1[i] != '\0')
    {
      ch = L1[i];
      if (IsDigit())
      {
        strToken[index] = ch;
        i++;
        index++;
        if (L1[i] == '\0') //读到第一个数字，但是是最后一个字符的情况
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        while (IsFloat())
        {
          strToken[index] = ch;
          i++;
          index++;
          if (L1[i] == '\0') //如果找到最后，数字被截断了，就把标志状态改一下
          {
            fgets(temp, size, fp);
            strcpy(L1, temp);
            i = 0;
          }
          ch = L1[i];
        }
        if (IsLetter())
        {
          strToken[index] = ch;
          i++;
          index++;
          if (L1[i] == '\0')
          {
            fgets(temp, size, fp);
            strcpy(L1, temp);
            i = 0;
          }
          ch = L1[i];
          while (IsError())
          {
            strToken[index] = ch;
            i++;
            index++;
            if (L1[i] == '\0')
            {
              fgets(temp, size, fp);
              strcpy(L1, temp);
              i = 0;
            }
            ch = L1[i];
          }
          strToken[index] = '\0';
          printf("Errors!    ");
          printf("%s\n", strToken);
          fputs(strToken, err_file);
          fputc('\n', err_file);
          res_file = InserRes("error", strToken, res_file);
          WritedFlag = 1;
        }
        else
        {
          strToken[index] = '\0';
          printf("<%s\t,\t%s>\n", IsIntorFloatorExponent(strToken), strToken);
          res_file = InserRes(IsIntorFloatorExponent(strToken), strToken, res_file); //向结果result文件中输出信息
          WritedFlag = 1;
        }
      }
      else if (IsLetter())
      {
        ch = L1[GetBC(L1, i)];
        strToken[index] = ch;
        i++;
        index++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        while (IsIdentifier())
        {
          strToken[index] = ch;
          i++;
          index++;
          if (L1[i] == '\0')
          {
            fgets(temp, size, fp);
            strcpy(L1, temp);
            i = 0;
          }
          ch = L1[i];
        }
        strToken[index] = '\0';
      }
      else if (ch == '#')
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      else if (ch == '+')
      {
        strToken[index] = ch;
        index++;
        i++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '+')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        if (ch == '=')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        strToken[index] = '\0';
      }
      else if (ch == '-')
      {
        strToken[index] = ch;
        index++;
        i++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '-')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        if (ch == '=')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        strToken[index] = '\0';
      }
      else if (ch == '!')
      {
        strToken[index] = ch;
        index++;
        i++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '=')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        strToken[index] = '\0';
      }
      else if (ch == '*') //识别乘号和指针变量
      {
        if (L1[i - 1] == ' ' || L1[i - 1] == ';')
        {
          ch = L1[GetBC(L1, i)];
          strToken[index] = ch;
          i++;
          index++;
          if (L1[i] == '\0')
          {
            fgets(temp, size, fp);
            strcpy(L1, temp);
            i = 0;
          }
          ch = L1[i];
          while (IsIdentifier())
          {
            strToken[index] = ch;
            i++;
            index++;
            if (L1[i] == '\0')
            {
              fgets(temp, size, fp);
              strcpy(L1, temp);
              i = 0;
            }
            ch = L1[i];
          }
          strToken[index] = '\0';
        }
        else
        {
          strToken[index] = ch;
          index++;
          strToken[index] = '\0';
          i++;
          ch = L1[i];
        }
      }
      else if (ch == '/')
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      else if (ch == ':')
      {
        strToken[index] = ch;
        i++;
        index++;
        ch = L1[i];
        if (ch == '=')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
          strToken[index] = '\0';
        }
        else
        {
          strToken[index] = '\0';
        }
      }
      else if (ch == '<')
      {
        strToken[index] = ch;
        i++;
        index++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '>')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
          //strToken[index] = '\0';
        }
        else if (ch == '=')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
          //  strToken[index] = '\0';
        }
        strToken[index] = '\0';
      }
      else if (ch == '>')
      {
        strToken[index] = ch;
        i++;
        index++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '=')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        strToken[index] = '\0';
      }
      else if (ch == '=')
      {
        strToken[index] = ch;
        index++;
        i++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '=')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        strToken[index] = '\0';
      }
      else if (ch == ';')
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      else if (ch == '(')
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      else if (ch == ')')
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      else if (ch == '{')
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      else if (ch == '}')
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      else if (ch == '|')
      {
        strToken[index] = ch;
        i++;
        index++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '|')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
          strToken[index] = '\0';
        }
      }
      else if (ch == '"')
      {
        i++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        while (ch != '"')
        {
          strToken[index] = ch;
          i++;
          index++;
          if (L1[i] == '\0')
          {
            fgets(temp, size, fp);
            strcpy(L1, temp);
            i = 0;
          }
          ch = L1[i];
        }
        strToken[index] = '\0';
        res_file = InserRes("string", strToken, res_file);
        WritedFlag = 1;
        i++;
        ch = L1[i];
      }
      else if (ch == '&')
      {
        strToken[index] = ch;
        i++;
        index++;
        if (L1[i] == '\0')
        {
          fgets(temp, size, fp);
          strcpy(L1, temp);
          i = 0;
        }
        ch = L1[i];
        if (ch == '&')
        {
          strToken[index] = ch;
          i++;
          index++;
          ch = L1[i];
        }
        strToken[index] = '\0';
      }
      else
      {
        strToken[index] = ch;
        index++;
        strToken[index] = '\0';
        i++;
        ch = L1[i];
      }
      if ((!WritedFlag) && (strcmp(" ", strToken) != 0))
      {
        res_file = InserRes(FindRes(strToken), strToken, res_file);
        printf("<%s\t,\t%s>\n", FindRes(strToken), strToken);
      }
      WritedFlag = 0; //找出一个字符串就要置0
      index = 0;
      memset(strToken, 0, sizeof strToken);
    }
  }
  printf("finish!\n");
}

// Preprocessor: copies the program in F_open to F_out, echoing the result to
// stdout, while removing line feeds, carriage returns, tabs, // and /* */
// comments, and collapsing each run of spaces into a single space.
// Prints "the file can not open" and returns if either file cannot be opened.
// String literals are not treated specially, so comment markers or repeated
// spaces inside them are processed like any other text.
void Yu(char F_open[], char F_out[])
{
  FILE *fp = fopen(F_open, "r"); // source program to preprocess
  if (fp == NULL)
  {
    printf("the file can not open\n");
    return;
  }
  FILE *op = fopen(F_out, "w"); // receives the preprocessed program
  if (op == NULL)
  {
    printf("the file can not open\n");
    fclose(fp);
    return;
  }

  // fgetc returns an int so that EOF stays distinct from every byte value;
  // storing it in a char would confuse byte 0xFF with EOF (or never see EOF).
  int c = fgetc(fp);
  while (c != EOF)
  {
    switch (c)
    {
    case '\n': // line breaks and tabs are dropped
    case '\r':
    case '\t':
      c = fgetc(fp);
      break;
    case ' ': // keep the first space of a run, skip the rest
      printf("%c", c);
      fputc(c, op);
      do
      {
        c = fgetc(fp);
      } while (c == ' ');
      break;
    case '/':
      c = fgetc(fp);
      if (c == '/') // line comment: skip to end of line; the '\n' is dropped by the next iteration
      {
        do
        {
          c = fgetc(fp);
        } while (c != EOF && c != '\n');
      }
      else if (c == '*') // block comment: skip up to and including the closing "*/"
      {
        int last = 0;
        c = fgetc(fp);
        while (c != EOF && !(last == '*' && c == '/'))
        {
          last = c;
          c = fgetc(fp);
        }
        if (c != EOF) // an unterminated comment simply ends at end of file
          c = fgetc(fp);
      }
      else // a lone '/' is ordinary text; c already holds the following character
      {
        printf("%c", '/');
        fputc('/', op);
      }
      break;
    default:
      printf("%c", c);
      fputc(c, op);
      c = fgetc(fp);
      break;
    }
  }

  fclose(fp);
  fclose(op);
  printf("\n\nYu finish!\n\n");
}

main.c

大家可以自行新建文本文件，只需要把对应的文件地址改一下即可，具体说明在注释中

#include "fun.h"
/*
Features added to the lexer:
1. enum and union keywords; struct and enum are recognised
2. logical operators && and ||
3. ++, -- and ==
4. !=, -= and +=
5. floating-point numbers
6. exponent notation
7. pointer variables
8. string literals
9. multi-line comments are removed
10. error reporting for digits followed by letters, e.g. 123fds (known bug:
    hexadecimal-looking input such as 123EFH is also reported as an error)
11. no limit on the size of the processed program
*/
// Usage: program [source [preprocessed [result [errors]]]]
// Each path that is not given on the command line falls back to the
// original hard-coded location under E:\test\shiyan1.
int main(int argc, char *argv[])
{
  char *source = argc > 1 ? argv[1] : "E:\\test\\shiyan1\\test.txt";         // program to analyse
  char *preprocessed = argc > 2 ? argv[2] : "E:\\test\\shiyan1\\out.txt";    // output of the preprocessor
  char *result = argc > 3 ? argv[3] : "E:\\test\\shiyan1\\result.txt";       // final token list
  char *errors = argc > 4 ? argv[4] : "E:\\test\\shiyan1\\errors.txt";       // malformed tokens

  Yu(source, preprocessed);
  Process(preprocessed, result, errors);
  return 0;
}