编译原理——词法分析

最新推荐文章于 2024-05-31 08:41:37 发布

RE9T

最新推荐文章于 2024-05-31 08:41:37 发布

阅读量788

点赞数 1

分类专栏：编译原理　文章标签：python c++

本文链接：https://blog.csdn.net/weixin_43323747/article/details/108533272

版权

编译原理专栏收录该内容

2 篇文章 1 订阅

订阅专栏

编译原理实验——词法分析

写出某个语言的词法分析程序，要求能识别出关键字，标识符，常数，运算符和分界符等常用语言单词符号

简单来说：词法分析就是将程序中的每个单词符号识别出来，并为其标记上对应的种别码。

输入：程序文本
输出：二元序列（单词，种别码）

本次识别语言种别码如下

单词符号	种别码	单词符号	种别码
int	1	=	11
long	2	if	12
start	3	then	13
end	4	endif	14
ID	5	,	15
NUM	6	;	16
+	7	==	17
-	8	<	18
*	9	>	19
/	10	error	0

词法分析流程图和状态转换图略，词法分析实际上只要能识别出单词符号然后标记上对应的种别码就行。
首先是待分析程序 test.txt文件

start 
	int a,b,c,d,e;
	a = 3; b = 6;
	if b<c then c=c-3;endif;
end

词法分析程序如下：
C++版

#include <iostream>
#include <fstream>
#include <string>
#include<algorithm>
#include <map>


using namespace std;
// Lexeme -> kind-code table. It is filled by init() and read by print() and
// scanner(); a lookup that yields 0 is treated as "not a known lexeme".
map<string,int> key_Iden;

/**
 * Fills the global key_Iden table with the lexeme -> kind-code pairs of the
 * language being analysed.
 *
 * Calling it repeatedly is harmless: every assignment overwrites the entry
 * with the same value.
 */
void init()
{
    // Keywords and the two token classes (identifier / number).
    key_Iden["int"]=1;
    key_Iden["long"]=2;
    key_Iden["start"]=3;
    key_Iden["end"]=4;
    key_Iden["ID"]=5;
    key_Iden["NUM"]=6;

    // Arithmetic operators and assignment.
    key_Iden["+"]=7;
    key_Iden["-"]=8;
    key_Iden["*"]=9;
    key_Iden["/"]=10;
    key_Iden["="]=11;

    // Control-flow keywords.
    key_Iden["if"]=12;
    key_Iden["then"]=13;
    key_Iden["endif"]=14;

    // Delimiters and comparison operators.
    key_Iden[","]=15;
    key_Iden[";"]=16;
    key_Iden["=="]=17;
    key_Iden["<"]=18;
    key_Iden[">"]=19;

    // Code 0 marks an unrecognised lexeme. Registering it explicitly keeps the
    // table complete instead of relying on operator[] default-inserting 0.
    key_Iden["error"]=0;
}

/**
 * Prints one token as "(code  lexeme)" followed by a newline.
 *
 * @param sflag key used to look up the kind code in key_Iden
 *              (e.g. "ID", "NUM", "error", or the lexeme itself).
 * @param st    the lexeme text to display.
 *
 * A flag that is not in the table prints code 0. find() is used instead of
 * operator[] so that an unknown flag does not add a new entry to the table.
 */
void print(string sflag,string st)
{
    const map<string,int>::const_iterator entry = key_Iden.find(sflag);
    const int code = (entry == key_Iden.end()) ? 0 : entry->second;
    cout<<"("<<code<<"  "<<st<<")"<<endl;
}

void scanner(string line)
{
    init();
    cout<<line<<endl;
    int p=0,endp = line.length(),l=0;
    char s[10]="",ch;
    while(p<endp)
    {
        ch=line[p];l=0;
        int iflag = 0;
        while((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z')||(ch>='0'&&ch<='9'))
        {
            //合法可用字符，读取完整
            iflag=1;
            s[l++]=ch;
            p++;
            ch=line[p];
        }
        if(iflag) //检查是否为标识符或
        {
            s[l]='\0';
            string st=s;
            if(key_Iden[st]==0)
            {
                //非关键字
                string sflag;
                if(s[0]>='a'&&s[0]<='z')
                {
                    print("ID",st);
                }
                else
                {
                    int flag=1;
                    for(int i=0; i<l&&flag; i++)
                    {
                        if(s[i]<'0'||s[i]>'9')
                            flag=0;
                    }
                    if(flag)
                        print("NUM",st);
                    else
                        print("error",st);
                }

            }
            else
            {
                //关键字
                print(st,st);
            }
        }
        if(ch!=' ')
        {
            string str[2];int i=-1;
            str[0]=line.substr(p,1);
            str[1]=line.substr(p,2);
            if(key_Iden[str[1]]){
                i=1;p+=2;
            }else{
                if(key_Iden[str[0]])
                    i=0;
                p++;
            }
            if(i!=-1)
                print(str[i],str[i]);
            else if(p<endp)
                print("error", str[0]);
        }else{
            p++;
        }
    }
}

/**
 * Pre-processing step: returns a copy of `line` in which every tab, carriage
 * return and line feed is replaced by a single space, so the scanner only has
 * to treat ' ' as a separator.
 *
 * Carriage returns matter for files with CRLF line endings, where getline()
 * leaves a trailing '\r' on each line.
 */
std::string  Replace(std::string line)
{
    for (std::string::size_type i = 0; i < line.size(); ++i)
    {
        const char c = line[i];
        if (c == '\t' || c == '\r' || c == '\n')
            line[i] = ' ';
    }
    return line;
}

/**
 * Reads test.txt, joins its lines into one string, normalises whitespace with
 * Replace() and hands the result to scanner(). Prints a message when the file
 * cannot be opened.
 */
int main()
{
    ifstream infile("test.txt");
    if(!infile.is_open())
    {
        cout<<"文件打开失败"<<endl;
        return 0;
    }

    string line;
    string t;
    while(getline(infile,t))
    {
        line += t;
        // getline() drops the line break; put a space in its place so the last
        // token of one line cannot fuse with the first token of the next.
        line += ' ';
    }
    scanner(Replace(line));
    return 0;
}

c++版的就需要我们自己一个个读取字符，然后凑对识别，代码逻辑主要在scanner函数那里。

python版

import os
import re

# Kind codes for keywords and for the ID / NUM / error token classes.
key_Iden = {
    'int': 1,
    'long': 2,
    'start': 3,
    'end': 4,
    'ID': 5,
    'NUM': 6,
    'if': 12,
    'then': 13,
    'endif': 14,
    'error': 0,
}

# Kind codes for operators and delimiters. init() iterates over this dict in
# insertion order, so the order of the entries is kept as is.
key_sym = {
    '+': 7,
    '-': 8,
    '*': 9,
    '==': 17,
    '/': 10,
    '=': 11,
    ',': 15,
    ';': 16,
    '<': 18,
    '>': 19,
}

# Result of the lexical analysis: a list of [lexeme, kind code] pairs.
sym_id = list()

def init(file):
    """
    Pre-process the program text and split it into raw lexemes.

    Carriage returns, tabs and newlines become spaces, every symbol in
    key_sym except '=' is padded with spaces, and the text is split on
    spaces. Chunks that still contain '=' (other than a lone '==') are then
    split around each '='.

    :param file: the program text as one string.
    :return: list of non-empty lexeme strings, in source order.
    """
    text = file
    # Normalise line breaks and tabs to plain spaces.
    for blank in ('\r', '\t', '\n'):
        text = text.replace(blank, ' ')

    # Pad operators and delimiters so that split(' ') isolates them. '=' is
    # skipped here because padding it would tear '==' apart.
    for symbol in key_sym:
        if symbol == '=':
            continue
        text = text.replace(symbol, ' ' + symbol + ' ')

    lexemes = []
    for chunk in text.split(' '):
        if not chunk:
            continue
        if chunk != '==' and '=' in chunk:
            # The capturing group keeps '=' itself as a lexeme.
            lexemes.extend(part for part in re.split(r'(=)', chunk) if part)
        else:
            lexemes.append(chunk)
    return lexemes


def scanner(file):
    """
    Classify every lexeme of the program text and append the result to sym_id.

    Each lexeme produced by init() is stored as [lexeme, kind code]. Lookup
    order: key_Iden, then key_sym, then all-digit strings (NUM), then
    alphanumeric strings starting with a letter (ID); anything else gets the
    error code.

    :param file: the program text as one string.
    """
    for lexeme in init(file):
        if lexeme in key_Iden:
            code = key_Iden[lexeme]
        elif lexeme in key_sym:
            code = key_sym[lexeme]
        elif lexeme.isdigit():
            code = key_Iden['NUM']
        elif lexeme.isalnum() and lexeme[0].isalpha():
            code = key_Iden['ID']
        else:
            code = key_Iden['error']
        sym_id.append([lexeme, code])

if __name__ == '__main__':
    # Read the whole program from test.txt, run the lexer over it, then print
    # the resulting (symbol, kind code) table. Indentation uses spaces only:
    # mixing a tab with spaces here makes Python 3 reject the file.
    with open("test.txt", encoding='utf-8') as f:
        print("待分析程序")
        scanner(f.read())
        print("< %5s|%3s >"%("符号","内码值"))
        for i in sym_id:
            print("< %6s | %2s >"%(i[0],i[1]))