编译原理实验：词法分析（C++）

晨曦✘

已于 2024-08-14 13:43:49 修改

阅读量780

点赞数 10

文章标签： c++ 开发语言

于 2024-08-14 13:35:45 首次发布

本文链接：https://blog.csdn.net/qq_38304433/article/details/141188208

版权

当时为了让各模块界限分明，我把标识符识别和词法分析写成了两个独立的函数，细看其实有些冗余；写完后嫌麻烦就没有再修改。你可以试着把这两个函数合并，直接在标识符识别的过程中完成词法分析的要求。代码中的注释应该也比较清晰。

1.预处理

【任务介绍】在词法分析之前，对程序员提交的源程序进行预处理，剔除注释等不必要的字符，以简化词法分析的处理

【输入】字符串形式的源程序。

【输出】处理之后的字符串形式的源程序。

【题目】设计一个程序，从任意字符串中剔除C语言形式的注释，包括：

（1）形如： //…的单行注释；

（2）形如： /*…*/ 的多行注释。

2.标识符的识别

【任务介绍】根据给定源语言的构词规则，从任意字符串中识别出所有的合法标识符。

【输入】字符串。

【输出】单词符号流，一行一个单词。

【题目】设计一个程序，从任意字符串中识别出所有可视为C语言“名字”的子串。注意：

（1）构词规则：以字母打头，后跟任意多个字母、数字的单词；长度不超过15；不区分大小写；把下划线视为第27个字母。

（2）关键字保留，即：语言定义中保留了某些单词用作关键字，同学们不可以将这些单词用作“名字”（变量名、常量名、函数名、标号名等等）。

3.词法分析

【任务介绍】根据给定源语言的构词规则，从任意字符串中识别出该语言所有的合法的单词符号，并以等长的二元组形式输出。

【输入】字符串形式的源程序。

【输出】单词符号所构成的串（流），单词以等长的二元组形式呈现。

【题目】设计一个程序，根据给定源语言的构词规则，从任意字符串中识别出该语言所有的合法的单词符号，并以等长的二元组形式输出。注意：

（1）附录A中介绍了一个基于C语法规则设计的源语言LittleC（我使用的）和一个基于Pascal语法规则设计的源语言LittleP，可以作为参考。

（2）同学们可以自行挑选或设计一种源语言，以此为基础来完成本实验和后续实验。该语言的设计应该满足附录B的要求。

（3）该程序应该设计为至少包含2个模块：驱动模块和工作模块。驱动模块包含了程序的入口和出口，主要负责输入、输出处理并调用工作模块；工作模块负责具体的分割、识别、归类等工作。这样做的好处是：只要模块间的接口（需要传递哪些数据，数据的结构）设计合理，后续实验中做语法分析器时就可以直接调用此处的工作模块，而不需要改动太多代码。

4.代码展示

注：程序读取的输入文件（input.txt）内容是一段C语言源程序；所用的文法只需要包含下面代码中列出的关键字、运算符、分隔符这三类符号即可，详细的解释我已经写在代码注释里了。

#include <algorithm> // for std::find
#include <cctype>
#include <cstring>
#include <fstream>
#include <iostream>
#include <queue>
#include <string>
using namespace std;


// One entry of the lexical-analysis result table: a (type, lexeme) pair.
struct Analysis {    
    int type;             // category code written by Lexical_analysis: 1 keyword, 2 operator, 3 delimiter, 4 number, 5 identifier
    string array;        // the token text itself
};
// Result table filled by Lexical_analysis(); room for 1024 tokens.
Analysis a_struct[1024];


// Keywords recognized by the analyzer (4 entries, each at most 5 characters plus the terminator).
static char keywords[4][6] = { "int","if","else","while"};

// Operators of the source language (11 entries, one or two characters each).
// NOTE(review): not referenced by the code visible in this file; the functions
// use their own single-character `op` tables instead.
static char operators[11][3] = { "+", "-", "*", "/", "=", ">", "<", ">=", "<=", "==", "!=" };
// Delimiters recognized by the analyzer.
static char delimiters[6] = { '{', '}', '(', ')', ',', ';'};  // 6 delimiters in total


// Forward declarations of the processing stages.
// preprocess calls recognize_identifier, which in turn calls Lexical_analysis.
void preprocess(ifstream& fin, ofstream& fout);
void recognize_identifier(char space1[], int count);
void Lexical_analysis(char (&temp)[1024][30]);


// Driver: opens input.txt and output.txt, runs the preprocessing stage on them
// (which itself starts the later stages), then closes both streams.
int main() {
    ifstream source("input.txt");
    ofstream stripped("output.txt");

    // Both files must be usable before any work starts.
    const bool ready = source.is_open() && stripped.is_open();
    if (!ready) {
        cout << "文件不存在!";
        return 0;
    }

    preprocess(source, stripped);

    source.close();
    stripped.close();
    return 0;
}


// Preprocessing stage: removes C-style comments from the source text.
//
// Reads at most 1023 bytes from `fin`, drops `// ...` comments (keeping the
// line break that ends them) and `/* ... */` comments, writes the remaining
// text to `fout`, and passes it on to recognize_identifier().
// Text inside '...' or "..." literals is copied unchanged, so comment markers
// inside a literal are not treated as comments.
//
// fin  : opened input stream holding the source program.
// fout : opened output stream that receives the comment-free text.
void preprocess(ifstream& fin, ofstream& fout) {
    // One byte of the buffer is kept back for the terminating '\0'. This keeps
    // the one-character look-ahead (total[i + 1]) inside the array even when
    // the file fills the buffer.
    char total[1024] = { '\0' };
    fin.read(total, sizeof(total) - 1);
    const int length = static_cast<int>(fin.gcount());

    char space1[1024] = { '\0' }; // comment-free text, always NUL-terminated
    int out = 0;                  // number of characters written to space1

    // 0: ordinary code, 1: inside a // comment, 2: inside a /* */ comment.
    int number = 0;
    // Quote character that opened the current literal, '\0' when outside one.
    // Tracking the opening character lets an apostrophe inside "..." (or a
    // double quote inside '...') pass through without ending the literal.
    char quote = '\0';

    for (int i = 0; i < length; i++) {
        const char a = total[i];
        const char b = total[i + 1]; // '\0' once the data is exhausted

        if (quote != '\0') {
            space1[out++] = a;
            if (a == '\\' && i + 1 < length) {
                // Escaped character (e.g. \" or \\): copy it without
                // interpreting it as the end of the literal.
                space1[out++] = b;
                i++;
            }
            else if (a == quote || a == '\n') {
                // A literal ends at its matching quote. It is also abandoned at
                // a line break so that one stray quote cannot disable comment
                // stripping for the rest of the input.
                quote = '\0';
            }
            continue;
        }

        switch (number) {
        case 0:
            if (a == '/' && b == '/') {
                number = 1;
                i++; // skip the second '/'
            }
            else if (a == '/' && b == '*') {
                number = 2;
                i++; // skip the '*', so "/*/" is not mistaken for a complete comment
            }
            else {
                // Quotes only open a literal in ordinary code; quotes inside
                // comments are ignored because they are never seen here.
                if (a == '\'' || a == '"') {
                    quote = a;
                }
                space1[out++] = a;
            }
            break;
        case 1:
            if (a == '\n') {
                number = 0;
                space1[out++] = '\n'; // keep the line break that ended the comment
            }
            break;
        case 2:
            if (a == '*' && b == '/') {
                number = 0;
                i++; // skip the closing '/'
            }
            break;
        }
    }
    space1[out] = '\0';

    fout << space1;
    // Pass the length of the stripped text (not of the raw file) so the next
    // stage scans exactly the characters that were kept.
    recognize_identifier(space1, out);
}


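// Identifier recognition
/* The characters of the current word are collected in a queue. Once the word
   has ended (a space, a line break, or any character that cannot belong to a
   word), the queue is drained into one row of a plain character array.
   Each row is then checked: it must not be a keyword, must respect the length
   limit, and must be a well-formed identifier; valid ones are written out.
*/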

// Helper for recognize_identifier: moves the pending word from the queue into
// the next free row of the token table.
//
// q1   : characters of the word collected so far; `num` of them are removed.
// temp : token table; row `j` receives the word as a NUL-terminated string.
// j    : index of the row to fill; advanced by one after the row is written.
// num1 : write position inside the row; reset to 0 on return.
// num  : number of queued characters to move; reset to 0 on return.
//
// A row holds 30 bytes, so at most 29 characters are stored. The remainder of
// an over-long word is removed from the queue but discarded, so it can never
// spill into the following row. If the table is already full (j outside
// 0..1023) the word is dropped and `j` is left unchanged.
void ProcessQueueItems(queue<char>& q1, char temp[1024][30], int& j, int& num1, int& num) {
    const int max_rows = 1024;
    const int max_chars = 29; // one byte of each row is reserved for '\0'
    const bool row_available = (j >= 0 && j < max_rows);

    for (; num > 0 && !q1.empty(); num--) {
        if (row_available && num1 >= 0 && num1 < max_chars) {
            temp[j][num1++] = q1.front(); // always take the oldest character
        }
        q1.pop();
    }

    if (row_available) {
        if (num1 >= 0 && num1 <= max_chars) {
            temp[j][num1] = '\0'; // terminate explicitly instead of relying on a zeroed table
        }
        j++;
    }
    num1 = 0;
    num = 0;
}

void recognize_identifier(char space1[1024], int count) {
    ofstream fout1("output_identifier.txt");//此文件为第二个输出文件（主要保存输出的合法标识符）
    queue<char>q1;
    space1[count] = { '\0' };//在space1的末尾再加上0用来保证下面循环的正常运行；
    char temp[1024][30] = { '\0' };//用来存储粗略划分标识符的结果（处理完的结果包含五种，未进行筛选）
    int i = 0, j = 0, num = 0, num1 = 0;
    int flag = 0, flag1 = 0;

    char op[8] = { '+', '-', '*', '/', '=', '>', '<','!' };

    //粗略建立标识符数组
    while (count >= 0) {
        if (space1[i] != ' ' && space1[i] != '\n') {
            //首先判断是否为关键字和标识符和数字
            if (isalpha(space1[i]) || isdigit(space1[i]) || (space1[i] == '_') || space1[i] == '\0') {
                if ((q1.size() != '\0') && ((space1[i] == '\0') || flag == 1)) {//前半部分主要保证最后一次有效读入的队列中还有值但space[i]中的i值已指向无效空值时；其余都使用flag作为判断条件
                    ProcessQueueItems(q1, temp, j, num1, num); // 调用外置函数
                }
                else {
                    q1.push(space1[i]);//将值入队
                    flag = 0;
                    num++;
                    i++;
                    count--;
                }
            }
            //是否为分隔符,是的话进入
            else  if (find(begin(delimiters), end(delimiters), space1[i]) != end(delimiters)) {//这个就是比对一维数组delimiters与当前字符是否有相同的值，有的话进入
                if (q1.size() != '\0') {//此处再一次判断是为了保证遇到分隔符之后队列中有值能够先存入temp,因为下列i值会增加，导致队列还未出队，便已经开始存入新值
                    ProcessQueueItems(q1, temp, j, num1, num); // 调用外置函数
                }
                temp[j++][0] = space1[i];
                i++;
                count--;
                flag = 1;
            }
            //是否为运算符，是的话进入
            else  if (find(begin(op), end(op), space1[i]) != end(op)) {
                if (q1.size() != '\0') {
                    ProcessQueueItems(q1, temp, j, num1, num); // 调用外置函数
                }
                //下列if为双值运算符 ">=" "<=" "==" "!=" 之后若有新运算符需添加，修改下列判断条件即可
                if ((space1[i] == '>' && space1[i + 1] == '=') || (space1[i] == '<' && space1[i + 1] == '=') || (space1[i] == '=' && space1[i + 1] == '=') || (space1[i] == '!' && space1[i + 1] == '=')) {
                    temp[j][0] = space1[i++];
                    temp[j++][1] = space1[i++];
                    count -= 2;//一次存入两个字符所以减2
                }
                else {
                    //此为单个运算符
                    temp[j++][0] = space1[i++];
                    count--;
                }
                flag = 1;
            }
            else if (space1[i] == '"'|| space1[i] == '\'') {//遇见单双引号识别为字符串常量，直到遇见下一个相对应的单双引号跳出
                while (1) {
                    count--;
                    i++;
                    if (space1[i] == '"' || space1[i] == '\'') {
                        break;
                    }
                }
                i++;
                count--;
            }
            else  {//不属于上述的要求的字符值
                i++;
                count--;
                if (q1.size() != '\0') {
                    ProcessQueueItems(q1, temp, j, num1, num); // 调用外置函数
                }
            }
        }
        else {
            i++;
            count--;
            flag = 1;
        }
    }

    int num2 = j;//num2:为了在下述循环中得到循环次数
    //判断是否为合法字符串
    //去除关键字
    int count1 = 0;//计数
    for (int j = 0;j < num2;j++) {
        for (int i = 0;i < 4;i++)
        {
            if (strcmp(keywords[i], temp[j]) == 0)//当遇到相同关键字时，直接跳出当前循环导致count1<4
            {
                break;
            }
            count1++;
        }
        //去除数字和长度不符的标识符
        if (count1 == 4 && (strlen(temp[j]) <= 15 && (isalpha(temp[j][0]) || (temp[j][0] == '_'))))//只有count1=4且temp中字符串的大小小于15且temp的首字符符合判断要求
        {
            cout << temp[j] << endl;
            fout1 << temp[j] << endl;

        }
        count1 = 0;
    }
    fout1.close();
    Lexical_analysis(temp);//进行词法分析，传入的是未进行详细划分的数组（5种类型都包含在内）
}

//词法分析
void Lexical_analysis(char (&temp)[1024][30]) {
    ofstream fout2("output_binary.txt");//此文件为第二个输出文件（主要保存输出的合法二元组）
    char op[8] = { '+', '-', '*', '/', '=', '>', '<','!' };
    string temp1[1024]; // 假设字符串数组的大小为1024
    for (int i = 0; i < 1024; i++) {
        temp1[i] = temp[i];
    }
    int i = 0, number = 1;
    int a_count = 0;
    while (temp[i][0]!='\0') {   
        switch (number) {
            //判断为关键字
        case 1:
            for (int j = 0;j < 4;j++)
            {
                if (strcmp(keywords[j], temp[i]) == 0)//当遇到相同关键字时，直接跳出当前循环
                {
                    a_struct[a_count].type = 1;//type:1 
                    //strcpy_s(a_struct[a_count++].array, temp[i++]);//这个是char类型之间的 
                    a_struct[a_count++].array = temp1[i++];//只有当识别到为关键字时，i++,保证case5不会重复读入
                    break;//因为此break只跳出的是for循环，仍会逐一进行case2/3/4/5,当在case5时，若没有上面的i++，则仍会再次存入相同的关键字！
                }
            }
            //判断为运算符
        case 2:
            if (find(begin(op), end(op), temp[i][0]) != end(op)) {
                a_struct[a_count].type = 2;//type:2
                //strcpy_s(a_struct[a_count++].array, temp[i]);
                a_struct[a_count++].array = temp1[i];
                break;
            }
           
            //判断为分隔符
        case 3:
            if (find(begin(delimiters), end(delimiters), temp[i][0]) != end(delimiters)) {
                a_struct[a_count].type = 3;//type:3
                //strcpy_s(a_struct[a_count++].array, temp[i]);
                a_struct[a_count++].array = temp1[i];
                break;
            }
            //判断为数字
        case 4:
            if (isdigit(temp[i][0])) {
                a_struct[a_count].type = 4;//type:4
                //strcpy_s(a_struct[a_count++].array, temp[i]);
                a_struct[a_count++].array = temp1[i];
                break;
            }
            //判断为标识符
        case 5:
            a_struct[a_count].type = 5;//type:5
            //strcpy_s(a_struct[a_count++].array, temp[i]);
            a_struct[a_count++].array = temp1[i];
            break;
        }
        i++;
    }
    for (int i =0;i < a_count;i++) {
        fout2 << "( " << a_struct[i].type << " , " << a_struct[i].array << " )"<<endl;
        cout << "( " << a_struct[i].type << " , " << a_struct[i].array << " )"<<endl;
    }
}