编译原理:第七节 及词法分析器的C++和Python实现

编译原理:词法分析

概述:

词法分析是编译程序第一个阶段的工作。所谓词法分析,就是对以字符串形式输入的源程序按顺序进行扫描,识别出其中的单词符号(token)并作为输出。词法分析器的作用
就是完成这个阶段的工作。词法分析器是所有编译器所必需的。例如:

这是Python的架构,我们可以看到Scanner,它所做的就是进行词法分析。
举个简单的C/C++的例子,来直观的看一下词法分析器的功能:

我们看到词法分析器识别出了关键字、标识符、整数和一些特殊的符号,并以二元组的形式输出。我们使用的编译器的词法分析和这个在原理上是一致的,只是复杂了很多。

词法分析器的实现:

首先,构造识别单词的状态转换图
然后,编程实现状态转换图

例如我们用DFA来表示可识别的单词:

我们把它们进行合并,如图:

利用此图我们就可以识别标识符,无符号整数,分界符,运算符。接下来我们就可以编程来实现。其实本质来说我们就是利用状态转移,我们可以规定开始时为0状态(即图中的S状态)读入一个字符,如果是字母,则状态变为1,是数字则状态变为2........出口定义为结束状态,可以用一个整数值来表示,若状态为终结状态则输出。上图部分用代码表示:
GETNEXTCHAR( ) ; 
SWITCH(CHCODE)
{
CASE   1: { WHILE  (ISLETTER OR ISDIGIT) DO
                                               { 
                                                     SAVE( ); //  当前字符放入一临时字符数组; 
                                                    GETNEXTCHAR( ) ;//从缓冲区取下一字符
                                                  };
                                    UNGETCH;//回退一字符
                                    OUTPUT(1,标识符名字);
                               };BREAK;
CASE   2: { WHILE ISDIGIT DO
                                                      { 
                                                      SAVE( ); //  当前字符放入一临时字符数组;
                                                       GETNEXTCHAR ;//从缓冲区取下一字符
                                                         };
                                     UNGETCH;//回退一字符
                                     OUTPUT (2, 整数);
                            }; BREAK;  

下面给出一个词法分析器的完整代码:
#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>
#include<memory.h>
#include<string.h>
#define MAXSIZE 100//capacity of the token buffer, of each stored comment and of both message tables
#define RESWORD 9//number of entries in ResWord
int state = 0;//current DFA state
int index = 0;//position of the last character stored in chserious for the current token
              //NOTE(review): the name may clash with the legacy index() that some C libraries declare via <string.h> - confirm on the target toolchain
char ch;//character read most recently
char chserious[MAXSIZE];//buffer holding the text of the token being recognised
long ll;//file read position
int line = 0;//line counter, advanced as newlines are read
int error_count = 0;//number of unrecognised characters recorded
int annotate_count = 0;//number of comments recorded
char ResWord[][10] = {"int","if","then","else","end","repeat","until","read","write"};//reserved words
//Input stream, opened while the globals are initialised; the result is not checked here.
//NOTE(review): a function call as a file-scope initialiser is accepted by C++ but not by C,
//and the 't' in the mode string looks like a platform extension - confirm on the target toolchain.
FILE *file = fopen("N:\\wang.txt","rt+");
struct Error_message//one unrecognised character and where it was seen
{
	int line_number;//line number reported for the character
	char error_char;//the unrecognised character itself
};
struct Annotate_message//one comment and where it was seen
{
	int line_number;//line number reported for the comment
	char annotate[MAXSIZE];//comment text, i.e. the characters after the two slashes up to the end of the line
};
//Returns true when str is exactly equal to one of the RESWORD entries of ResWord, false otherwise.
bool CheckRes(char *str)
{
	int k = 0;
	while(k < RESWORD)
	{
		if(!strcmp(ResWord[k],str))
			return true;//exact match with a reserved word
		++k;
	}
	return false;
}
struct Error_message message[MAXSIZE];//unrecognised characters collected by main; error_count entries are in use
struct Annotate_message annotateMessage[MAXSIZE];//comments collected by main; annotate_count entries are in use
int main()
{	
	while(!feof(file))
	{
		ch = fgetc(file);
		if(ch == '\n')
			line++;
		switch(state)
		{
			case 0:
				index = 0;
				chserious[0] = ch;
				if(isalpha(ch))
					state = 1;
				else if(isdigit(ch))
					state = 3;
				else if(ch == '+')
					state = 5;
				else if(ch == '-')
					state = 9;
				else if(ch == '*')
					state = 13;
				else if(ch == '/')
					state = 16;
				else if(ch == '=')
					state = 20;
				else if(ch == '<')
					state = 21;
				else if(ch == '{')
					state = 22;
				else if(ch == '}')
					state = 23;
				else if(ch == ';')
					state = 24;
				else if(ch != EOF)
				{	
					if(!isspace(ch)&&(ch!='\n'))
					{
						message[error_count].line_number = line/2;
						message[error_count].error_char = ch;
						error_count++;
					}
					state = 25;
				}
				break;

			case 1:
				while(isalpha(ch)||isdigit(ch))
				{
					index ++;
					chserious[index] = ch;
					ch = fgetc(file);
				}			
				ll = ftell(file);
				if(ch == '\n')
					line++;
				if(ch == EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
				else
					fseek(file,ll-2L, SEEK_SET);//回退
				state = 2;
				break;
			case 2:
				if(CheckRes(chserious))
					printf("(关键字,%s)\n",chserious);
				else
					printf("(标识符,%s)\n",chserious);
				state = 0;
				index = 0;
				memset(chserious,'\0',sizeof(chserious));
				break;
			case 3:
				while(isdigit(ch))
				{
					index++;
					chserious[index] = ch;
					ch = fgetc(file);
				}
				ll = ftell(file);
				if(ch == EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
				else
					fseek(file,ll-2L, SEEK_SET);//回退
				state = 4;
				break;
			case 4:
				if(ch == '\n')
					line++;
				printf("(数,%s)\n",chserious);
				state = 0;
				index = 0;
				memset(chserious,'\0',sizeof(chserious));
				break;
			case 5:
				if(ch == '+')
					state = 6;
				else if(ch == '=')
					state = 7;
				else
					state = 8;
				
				break;
			case 6:
				printf("(特殊符号,++)\n");
				ll = ftell(file);
				if(ch!=EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
				state = 0;
				break;
			case 7:
				printf("(特殊符号,+=)\n");
				ll = ftell(file);
				//printf("%ld",ll);
				if(ch!=EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
			
				state = 0;
				break;
			case 8:
				printf("(特殊符号,+)\n");
				ll = ftell(file);
				fseek(file,ll-2L, SEEK_SET);//回退
				state = 0;
				break;
			case 9:
				if(ch == '-')
					state = 10;
				else if(ch == '=')
					state = 11;
				else
					state = 12;
				break;
			case 10:
				printf("(特殊符号,--)\n");
				ll = ftell(file);
				if(ch!=EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
				state = 0;
				break;
			case 11:
				printf("(特殊符号,-=)\n");
				ll = ftell(file);
				if(ch!=EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
				state = 0;
				break;
			case 12:
				printf("(特殊符号,-)\n");
				ll = ftell(file);
				fseek(file,ll-2L, SEEK_SET);//回退
				state = 0;
				break;
			case 13:
				if(ch == '=')
					state = 14;
				else
					state = 15;
				break;
			case 14:
				printf("(特殊符号,*=)\n");
				ll = ftell(file);
				if(ch!=EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
				state = 0;
				break;
			case 15:
				printf("(特殊符号,*)\n");
				ll = ftell(file);
				fseek(file,ll-2L, SEEK_SET);//回退
				state = 0;
				break;
			case 16:
				if(ch == '/')
					state = 17;
				else if(ch == '=')
					state = 18;
				else
					state = 19;
				break;

			case 17:
				{
					printf("(特殊符号,//)\n");
					ll = ftell(file);
					fseek(file,ll-1L, SEEK_SET);//回退
					state = 0;

					ch = fgetc(file);
					//printf("line:%d\n",line/2);
					annotateMessage[annotate_count].line_number = line/2;
					int j = 0;
					while(ch!='\n'&&ch!=EOF)
					{
						//printf("---------%c******\n",ch);
						annotateMessage[annotate_count].annotate[j] = ch;
						ch = fgetc(file);
						j++;
					}
					line+=2;
					annotate_count++;
					//exit(0);

					break;
				}
			case 18:
				printf("(特殊符号,/=)\n");
				ll = ftell(file);
				if(ch!=EOF)
					fseek(file,ll-1L, SEEK_SET);//回退
				state = 0;
				break;
			case 19:
				printf("(特殊符号,/)\n");
				ll = ftell(file);
				fseek(file,ll-2L, SEEK_SET);//回退
				state = 0;
				break;
			case 20:
				printf("(特殊符号,=)\n");
				if(ch != EOF)
				{
				    ll = ftell(file);
				    fseek(file,ll-1L, SEEK_SET);//回退
				}
				state = 0;
				break;
			case 21:
				printf("(特殊符号,<)\n");
				if(ch != EOF)
				{
				    ll = ftell(file);
				    fseek(file,ll-1L, SEEK_SET);//回退
				}
				state = 0;
				break;
			case 22:
				printf("(特殊符号,{)\n");
				if(ch != EOF)
				{
				    ll = ftell(file);
				    fseek(file,ll-1L, SEEK_SET);//回退
				}
				state = 0;
				break;
			case 23:
				printf("(特殊符号,})\n");
				if(ch != EOF)
				{
				    ll = ftell(file);
				    fseek(file,ll-1L, SEEK_SET);//回退
				}
				state = 0;
				break;
			case 24:
				printf("(特殊符号,;)\n");
				if(ch != EOF)
				{
				    ll = ftell(file);
				    fseek(file,ll-1L, SEEK_SET);//回退
				}
				state = 0;
				break;
			case 25:
				if(isspace(ch))
					printf("(特殊符号,空格)\n");
				//printf("****%c****\n",ch);
				if(ch != EOF)
				{
				    ll = ftell(file);
				    fseek(file,ll-1L, SEEK_SET);//回退
				}
				state = 0;
			default:
				break;
		}
	}
	fclose(file);
	//printf("%d\n",state);
	//printf("%s\n",chserious);
	printf("注释内容的个数为:%d 分别为:\n",annotate_count);
	for(int j=0;j<annotate_count;j++)
	{
		printf("(%d,%s)\n",annotateMessage[j].line_number,annotateMessage[j].annotate);
	}
	printf("不可识别的字符个数为:%d 分别为:\n",error_count);
	for(int i=0;i<error_count;i++)
	{
		printf("(%d,%c)\n",message[i].line_number,message[i].error_char);
	}	
	return 0;
}

下面我用Python重写了这个词法分析器,更加简洁,每次读入文件一行进行处理(上面C++版本每次读入一个字符),代码如下:

# -*- coding: cp936 -*-
'''
DFA有限自动机Python实现
作者:王灿
2015-9-27于中国矿业大学
'''
class DFA:
    """Hand-written DFA lexer that scans its input one line at a time.

    Construct it with an iterable of text lines (for example an open text
    file), call ``Start_convert()`` once, then read the results through
    ``Get_char()``, ``Get_annotate()`` and ``Get_error()``.

    Recognised tokens: identifiers / reserved words, unsigned integers, the
    operators ``+ ++ += - -- -= * *= / /= = <``, the delimiters ``{ } ;`` and
    ``//`` line comments.  Any other non-blank character is reported as an
    error together with its 1-based line number.
    """
    file_object = ''  # iterable yielding the source text line by line
    line_number = 0  # number of lines read so far (1 for the first line)
    state = 0  # current DFA state
    ResWord = ['int','if','then','else','end','repeat','until','read','write']  # reserved words
    error_message = []  # one string per unrecognised character, containing line number and character
    annotate_message = []  # one string per comment, containing the comment text
    char_message = []  # one string per recognised token, containing category and text

    def __init__(self,file_name):
        """Store the line source; ``file_name`` is iterated, not opened."""
        self.file_object = file_name
        self.state = 0
        self.line_number = 0
        self.error_message = []
        self.annotate_message = []
        self.char_message = []

    def Start_convert(self):
        """Tokenise every line of ``self.file_object`` and fill the result lists.

        State numbering:
          0        start
          1 -> 2   identifier / reserved word
          3 -> 4   unsigned integer
          5,9,13,16  one of ``+ - * /`` seen, next character decides
          6,7,10,11,14,18  two-character operator complete
          17       ``//`` comment
          8,12,15,19,20..24  one-character symbol complete
          25       skip white space
          26       unrecognised character

        An accepting state is always entered with ``i`` moved back so that it
        re-reads the last character of its own token.
        """
        one_char_start = {'=': 20, '<': 21, '{': 22, '}': 23, ';': 24}
        two_char_done = (6, 7, 10, 11, 14, 18)
        one_char_done = (8, 12, 15, 19, 20, 21, 22, 23, 24)

        for raw_line in self.file_object:  # process the input line by line
            text = raw_line.strip('\n')  # the line without its terminator
            # A white-space sentinel guarantees that every token is followed by
            # a delimiter, so a token that ends the line is still completed and
            # the "step back two characters" logic never runs off the line.
            line = text + '\n'
            self.line_number += 1
            self.state = 0  # tokens never span lines
            line_length = len(line)
            i = 0
            string = ''  # text of the token being built
            while i < line_length:
                ch = line[i]
                i += 1
                if self.state == 0:  # start state
                    string = ch
                    if ch.isalpha():
                        self.state = 1
                    elif ch.isdigit():
                        self.state = 3
                    elif ch == '+':
                        self.state = 5
                    elif ch == '-':
                        self.state = 9
                    elif ch == '*':
                        self.state = 13
                    elif ch == '/':
                        self.state = 16
                    elif ch in one_char_start:
                        self.state = one_char_start[ch]
                        i -= 1  # the accepting state re-reads the symbol
                    elif ch.isspace():
                        self.state = 25
                    else:
                        self.state = 26  # unrecognised character
                        i -= 1
                elif self.state == 1 or self.state == 3:
                    # Inside an identifier (1) or an integer (3): absorb the rest.
                    if self.state == 1:
                        while ch.isalpha() or ch.isdigit():
                            string += ch
                            ch = line[i]  # safe: the sentinel ends the loop before the line does
                            i += 1
                        self.state = 2
                    else:
                        while ch.isdigit():
                            string += ch
                            ch = line[i]
                            i += 1
                        self.state = 4
                    i -= 2  # step back over the delimiter and the token's last character
                elif self.state == 2:
                    if string in self.ResWord:
                        content = '(关键字,' + string + ')'
                    else:
                        content = '(标识符,' + string + ')'
                    self.char_message.append(content)
                    string = ''
                    self.state = 0
                elif self.state == 4:
                    content = '(数字,' + string + ')'
                    self.char_message.append(content)
                    string = ''
                    self.state = 0
                elif self.state == 5:  # '+' seen
                    if ch == '+':
                        self.state = 6
                        i -= 1
                    elif ch == '=':
                        self.state = 7
                        i -= 1
                    else:
                        self.state = 8
                        i -= 2
                elif self.state == 9:  # '-' seen
                    if ch == '-':
                        self.state = 10
                        i -= 1
                    elif ch == '=':
                        self.state = 11
                        i -= 1
                    else:
                        self.state = 12
                        i -= 2
                elif self.state == 13:  # '*' seen
                    if ch == '=':
                        self.state = 14
                        i -= 1
                    else:
                        self.state = 15
                        i -= 2
                elif self.state == 16:  # '/' seen
                    if ch == '/':
                        self.state = 17
                        i -= 1
                    elif ch == '=':
                        self.state = 18
                        i -= 1
                    else:
                        self.state = 19
                        i -= 2
                elif self.state in two_char_done:
                    # string holds the first character, ch the second one
                    content = '(特殊符号,' + string + ch + ')'
                    self.char_message.append(content)
                    string = ''
                    self.state = 0
                elif self.state == 17:  # '//' comment
                    content = '(特殊符号,' + string + ch + ')'
                    self.char_message.append(content)
                    content = '(注释,'+ text[i:] +')'
                    self.annotate_message.append(content)
                    string = ''
                    self.state = 0
                    i = line_length  # the rest of the line is comment text, not tokens
                elif self.state in one_char_done:
                    # ch is the re-read symbol itself
                    content = '(特殊符号,' + ch + ')'
                    self.char_message.append(content)
                    string = ''
                    self.state = 0
                elif self.state == 25:  # skip a run of white space
                    while ch.isspace():
                        if i < line_length:
                            ch = line[i]
                            i += 1
                        else:
                            break
                    self.state = 0
                    i -= 1  # re-read the first non-blank character in state 0
                elif self.state == 26:
                    content = '(行号:'+str(self.line_number)+',' + ch + ')'
                    self.error_message.append(content)
                    self.state = 0
                    string = ''

    def Get_error(self):
        """Return the list of unrecognised-character reports."""
        return self.error_message

    def Get_annotate(self):
        """Return the list of comment reports."""
        return self.annotate_message

    def Get_char(self):
        """Return the list of recognised tokens, in input order."""
        return self.char_message
'''
*****************测试内容************************
try:
    file_object = open("N:\\wang4.txt")
    dfa = DFA(file_object)
    dfa.Start_convert()
    content = dfa.Get_char()
    for item in content:
        print item
    content = dfa.Get_annotate()
    for item in content:
        print item
    content = dfa.Get_error()
    for item in content:
        print item
finally:  
    file_object.close()  
'''

利用面向对象的机制,我们很容易做到数据,数据处理和GUI端的分离,便于代码复用和重构,把上面Python代码做成一个简单的GUI如下:


  • 5
    点赞
  • 26
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值