编译原理词法分析器——C语言实现

词法分析器

(一)基本内容
实现一个可以识别C语言(子集)词法的程序,识别源代码中的每个词,并且将结果保存下来。
可识别的词包括
头文件:以#include开头的过滤为头文件
保留字:“auto”, “break”, “case”, “char”, “const”, “continue”,
“default”, “do”, “double”, “else”, “enum”, “extern”,“float”,
“for”, “goto”, “if”, “int”, “long”,“register”,“return”,
“short”, “signed”, “sizeof”, “static”,“struct”,“switch”,
“typedef”, “union”, “unsigned”, “void”,“volatile”,“while”

界符: “+”, “-”, “*”, “/”, “<”, “<=”, “>”, “>=”, “=”, “==”,
“!=”, “;”, “(”, “)”, “^”, “,”, “"”, “'”, “#”, “&”,
“&&”, “|”, “||”, “%”, “~”, “<<”, “>>”, “[”, “]”, “{”,
“}”, “\”, “.”, “?”, “:”, “!”

标识符:未出现在保留字中且符合标识符格式的记为标识符
(二)算法思路
将txt格式的源代码读入数组,
经过过滤函数处理:过滤掉头文件(严格来说应当将头文件展开替换,这里为了简化直接将其过滤掉),同时在屏幕上打印出该头文件;过滤掉换行符和制表符;过滤掉单行注释和多行注释;然后将过滤后的程序覆盖回原来的数组,并在末尾加入结束符号“$”,
进行扫描,在扫描过程中过滤空格,并通过事先建立好的DFA,来判定将要加入临时数组中的词,到不符合DFA的字符时则停止,此时返回和该临时数组相符的种别码,如:标识符种别码为100,保留字种别码为1-32等等,
通过种别码判断,如果是标识符,则在事先建立好的标识符数组中查找是否已经添加过该标识符,如果未添加过则在上一个标识符之后加入该标识符,同时打印到屏幕上,并且输出到新的目标文件。
(三)最终结果
源代码文件:
在这里插入图片描述
屏幕打印结果:
在这里插入图片描述
输出文件:
在这里插入图片描述
(四)备注
由于本程序只实现了编译器中的词法分析器部分,输出的二元组并未完全按照词法分析器的规则;为了便于查看输出状况、方便调试,这里将“标识符”等类别名称作为二元组的第一个值,将单词本身作为第二个值输出,如上图所示。
(五)程序代码

#include<stdio.h>
#include<string.h>
#include<stdlib.h>

int t = -1;// category code of the most recently scanned token; -1 until the first scan
int pProject = 0;// read cursor (index) into the source-program buffer

// Reserved-word table (32 C keywords).
// Entry order matters: the keyword at index i is reported with category code i + 1.
static char reserveWord[32][10] =
{
	"auto",     "break",    "case",     "char",
	"const",    "continue", "default",  "do",
	"double",   "else",     "enum",     "extern",
	"float",    "for",      "goto",     "if",
	"int",      "long",     "register", "return",
	"short",    "signed",   "sizeof",   "static",
	"struct",   "switch",   "typedef",  "union",
	"unsigned", "void",     "volatile", "while"
};


// Delimiter / operator table (36 entries).
// Entry order matters: the lexeme at index i is reported with category code 33 + i.
static char boredrWord[36][10] =
{
	"+",  "-",  "*",  "/",  "<",  "<=",   // codes 33..38
	">",  ">=", "=",  "==", "!=", ";",    // codes 39..44
	"(",  ")",  "^",  ",",  "\"", "'",    // codes 45..50
	"#",  "&",  "&&", "|",  "||", "%",    // codes 51..56
	"~",  "<<", ">>", "[",  "]",  "{",    // codes 57..62
	"}",  "\\", ".",  "?",  ":",  "!"     // codes 63..68
};

// Identifier table: holds up to 1000 names of at most 49 characters each.
// main() stores each distinct identifier once; an empty string marks the first free slot.
static  char identifier[1000][50] = {""};


/*
 * Pre-process the raw source text in place before scanning.
 *
 *  - Every "#include" directive is removed and reported on stdout as
 *    "(  头文件  ,  <rest of the line>  )".
 *  - Line comments and block comments are removed.
 *  - Newlines, tabs, carriage returns and block comments are each replaced
 *    by a single space, so the tokens on either side are not glued together
 *    (the scanner skips spaces).
 *  - The end marker '$' followed by a NUL terminator is appended.
 *
 * resourceProject: buffer holding the raw text; it must have room for at
 *                  least end + 2 characters because of the appended marker.
 * end:             number of valid characters in resourceProject.
 *
 * Returns the index of the '$' end marker, i.e. the filtered length.
 *
 * The filtered text is never longer than the text already consumed, so the
 * write index cannot overtake the read index and no scratch buffer is needed.
 * Every inner loop is bounded by `end`, so a directive without '<', a last
 * line without '\n' or an unterminated block comment cannot run off the data.
 * Comment markers inside string literals are not recognised as such.
 */
int filtrationResource( char resourceProject[] , int end )
{
	static const char includeTag[] = "#include";
	const int tagLen = (int)( sizeof includeTag - 1 );
	int count = 0;	/* write index into the filtered text */
	int i = 0;	/* read index into the raw text */

	while ( i < end )
	{
		char c = resourceProject[i];

		/* Header directive: report the rest of the line and drop it. */
		if ( c == '#' && end - i >= tagLen
			&& strncmp( &resourceProject[i] , includeTag , (size_t)tagLen ) == 0 )
		{
			i += tagLen;
			while ( i < end && ( resourceProject[i] == ' ' || resourceProject[i] == '\t' ) )
			{
				i++;
			}
			printf("(  头文件  ,  ");
			while ( i < end && resourceProject[i] != '\n' )
			{
				printf("%c",resourceProject[i++]);
			}
			printf("  )\n");
			continue;	/* the '\n' itself is handled as whitespace next round */
		}

		/* Line comment: skip up to, but not including, the newline. */
		if ( c == '/' && i + 1 < end && resourceProject[i + 1] == '/' )
		{
			while ( i < end && resourceProject[i] != '\n' )
			{
				i++;
			}
			continue;
		}

		/* Block comment: skip up to and including the closing marker. */
		if ( c == '/' && i + 1 < end && resourceProject[i + 1] == '*' )
		{
			i += 2;
			while ( i < end
				&& !( resourceProject[i] == '*' && i + 1 < end && resourceProject[i + 1] == '/' ) )
			{
				i++;
			}
			/* Step over the closing marker if found; otherwise the comment is unterminated. */
			i = ( i < end ) ? i + 2 : end;
			resourceProject[count++] = ' ';
			continue;
		}

		/* Line breaks and tabs become a plain space. */
		if ( c == '\n' || c == '\t' || c == '\r' )
		{
			resourceProject[count++] = ' ';
			i++;
			continue;
		}

		resourceProject[count++] = c;
		i++;
	}

	resourceProject[count] = '$';		/* end marker expected by the scanner */
	resourceProject[count + 1] = '\0';	/* keep the buffer a valid C string */
	return count;
}

// Report whether a character is an ASCII decimal digit.
// Returns 1 for '0'..'9' and 0 for every other character.
int IsDigit(char digit)
{
	return ( digit < '0' || digit > '9' ) ? 0 : 1;
}


// Report whether a character may appear in an identifier as a "letter".
// The underscore counts as a letter because C allows it anywhere in an
// identifier, including the first position.
// Returns 1 for 'a'..'z', 'A'..'Z' and '_', otherwise 0.
int IsLetter(char letter)
{
	if ( letter == '_' )
	{
		return 1;
	}
	if ( letter >= 'A' && letter <= 'Z' )
	{
		return 1;
	}
	if ( letter >= 'a' && letter <= 'z' )
	{
		return 1;
	}
	return 0;
}

// Look up a word in a reserved-word table of 32 entries.
// reserveWord: table to search (rows of at most 9 characters plus NUL).
// s:           NUL-terminated word to look for.
// Returns the 1-based position of the matching entry, which doubles as the
// keyword's category code, or -1 when the word is not in the table.
int searchReserve(char reserveWord[][10], char s[])
{
	int code = 1;	// category code of the entry currently being compared

	while ( code <= 32 )
	{
		if ( !strcmp(s, reserveWord[code - 1]) )
		{
			return code;
		}
		code++;
	}
	return -1;
}

/*
 * Scan the next token of the filtered source program.
 *
 * resourceProject: filtered source text, terminated by the '$' end marker.
 * token:           output buffer of at least 20 characters; receives the
 *                  lexeme of identifiers, reserved words, constants and
 *                  single-character delimiters.  Longer lexemes are
 *                  truncated to 19 characters but still consumed in full.
 *
 * Reading starts at the global cursor pProject, which is left just past the
 * recognised token.  The category code is stored in the global t and returned:
 *     0       end marker '$'
 *     1..32   reserved word (index in reserveWord + 1)
 *     33..68  delimiter / operator (index in boredrWord + 33)
 *     99      integer constant
 *     100     identifier
 * An unrecognised character prints an error message and terminates the
 * program with a failure status.
 */
int Scaning( char resourceProject[] , char token[] )
{
	enum { TOKEN_CAPACITY = 20 };	/* size of the token buffer supplied by the caller */
	int i, count = 0;		/* count: write index into token[] */
	char ch;			/* first character of the token */
	char next;			/* one-character lookahead for two-character operators */

	/* Skip blanks in front of the token. */
	while (resourceProject[pProject] == ' ')
	{
		pProject++;
	}
	ch = resourceProject[pProject];

	/* Start from an empty token so it is always NUL-terminated. */
	for (i = 0; i < TOKEN_CAPACITY; i++)
	{
		token[i] = '\0';
	}

	/* Identifier or reserved word: letter followed by letters or digits. */
	if (IsLetter(ch))
	{
		while (IsLetter(resourceProject[pProject]) || IsDigit(resourceProject[pProject]))
		{
			if (count < TOKEN_CAPACITY - 1)
			{
				token[count++] = resourceProject[pProject];
			}
			pProject++;	/* consume the character even when the lexeme is truncated */
		}
		token[count] = '\0';
		t = searchReserve(reserveWord, token);
		if (t == -1)
		{
			t = 100;	/* not a reserved word, so it is an identifier */
		}
		return t;
	}

	/* Integer constant: a run of digits. */
	if (IsDigit(ch))
	{
		while (IsDigit(resourceProject[pProject]))
		{
			if (count < TOKEN_CAPACITY - 1)
			{
				token[count++] = resourceProject[pProject];
			}
			pProject++;
		}
		token[count] = '\0';
		t = 99;		/* constants have their own category code */
		return t;
	}

	/* Delimiters that are always a single character: look the code up in the table.
	   strchr() also matches the terminating NUL, hence the explicit guard. */
	if (ch != '\0' && strchr("+-*/;()^,\"'~#%[]{}\\.?:", ch) != NULL)
	{
		token[0] = ch;
		token[1] = '\0';
		for (i = 0; i < 36; i++)
		{
			if (strcmp(token, boredrWord[i]) == 0)
			{
				t = 33 + i;	/* table index maps linearly onto the category code */
				break;
			}
		}
		pProject++;
		return t;
	}

	/* End marker: the cursor stays on it. */
	if (ch == '$')
	{
		t = 0;
		return t;
	}

	/* Anything that cannot start an operator is a lexical error. */
	if (ch == '\0' || strchr("<>=!&|", ch) == NULL)
	{
		printf("error:there is no exist %c00 \n", ch);
		exit(EXIT_FAILURE);
	}

	/* Operators that may be one or two characters long. */
	next = resourceProject[pProject + 1];
	switch (ch)
	{
	case '<':	/* <, <=, << */
		t = (next == '=') ? 38 : (next == '<') ? 58 : 37;
		break;
	case '>':	/* >, >=, >> */
		t = (next == '=') ? 40 : (next == '>') ? 59 : 39;
		break;
	case '=':	/* =, == */
		t = (next == '=') ? 42 : 41;
		break;
	case '!':	/* !, != */
		t = (next == '=') ? 43 : 68;
		break;
	case '&':	/* &, && */
		t = (next == '&') ? 53 : 52;
		break;
	default:	/* '|': |, || */
		t = (next == '|') ? 55 : 54;
		break;
	}

	/* Advance by the length of the operator's spelling, so "<<" consumes both characters. */
	pProject += (int)strlen(boredrWord[t - 33]);
	return t;
}
/*
 * Program entry point.
 *
 * Reads the source program from E:\resourceProject.txt, filters it with
 * filtrationResource(), then calls Scaning() repeatedly until the end marker
 * (category code 0) is reached.  Identifiers, reserved words, constants and
 * delimiters are printed to the screen and written to E:\targetProject.txt
 * as "(  category  ,  lexeme  )" pairs.  New identifiers are also recorded
 * in the identifier table.
 *
 * Returns 0 on success and EXIT_FAILURE when a file cannot be opened, the
 * source does not fit into the buffer, or the output cannot be written.
 */
int main()
{
	char resourceProject[10000];	/* raw, then filtered, source text */
	char token[20] = {0};		/* lexeme of the current token */
	int i;
	int c;				/* int, so EOF is distinguishable from every byte value */
	int length = 0;			/* number of characters read from the file */
	FILE *fp;

	if ( ( fp = fopen ( "E:\\resourceProject.txt", "r" ) ) == NULL )
	{
		printf("未找到源程序文档!");
		return EXIT_FAILURE;
	}

	/* Read the whole file, keeping two spare slots for the end marker and the NUL. */
	while ( ( c = fgetc(fp) ) != EOF )
	{
		if ( length >= (int)sizeof resourceProject - 2 )
		{
			printf("源程序过长,无法读入缓冲区!");
			fclose(fp);
			return EXIT_FAILURE;
		}
		resourceProject[length++] = (char)c;
	}
	resourceProject[length] = '\0';
	fclose(fp);

	(void)filtrationResource( resourceProject , length );
	pProject = 0;	/* scanning starts at the beginning of the filtered text */

	if ( ( fp = fopen ( "E:\\targetProject.txt", "w" ) ) == NULL )
	{
		printf("未找到输出文档!");
		return EXIT_FAILURE;
	}

	while (t != 0)
	{
		t = Scaning(resourceProject, token);
		if (t == 100)
		{
			/* Record the identifier unless it is already in the table. */
			for (i = 0; i<1000; i++)
			{
				if (strcmp(identifier[i], token) == 0)
				{
					break;	/* already known */
				}
				if (strcmp(identifier[i], "") == 0)
				{
					strcpy(identifier[i], token);	/* first free slot */
					break;
				}
			}
			printf("(  标识符  ,  %s  )\n", token);
			fprintf(fp,"(  标识符  ,  %s  )\n", token);
		}
		else if (t >= 1 && t <= 32)
		{
			printf("(  保留字  ,  %s  )\n", reserveWord[t - 1]);
			fprintf(fp,"(  保留字  ,  %s  )\n", reserveWord[t - 1]);
		}
		else if (t == 99)
		{
			printf("(  常数  ,  %s  )\n", token);
			fprintf(fp,"(  常数  ,  %s  )\n", token);
		}
		else if (t >= 33 && t <= 68)
		{
			printf("(  界符  ,  %s  )\n", boredrWord[t - 33]);
			fprintf(fp,"(  界符  ,  %s  )\n", boredrWord[t - 33]);
		}
	}

	/* fclose() flushes the buffered output, so its result tells whether the write succeeded. */
	if ( fclose(fp) != 0 )
	{
		printf("写入输出文档失败!");
		return EXIT_FAILURE;
	}
	return 0;
}

词法分析器是编译器的重要组成部分,用于将源代码转换为词法单元。以下是一个简单的词法分析器C语言代码:

```c
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#define MAX_LENGTH 100

typedef enum {
    INT, FLOAT, IDENTIFIER, ADD_OP, SUB_OP, MUL_OP, DIV_OP, ASSIGN_OP, SEMICOLON, INVALID
} TokenType;

typedef struct {
    TokenType type;
    char lexeme[MAX_LENGTH];
} Token;

Token getNextToken(FILE* fp) {
    Token token;
    char ch;
    int lexemeIndex = 0;
    token.type = INVALID;

    // 忽略空格和换行符
    while ((ch = fgetc(fp)) != EOF && isspace(ch));

    if (ch == EOF) {
        token.lexeme[0] = '\0';
        return token;
    }

    // 处理整数和浮点数
    if (isdigit(ch)) {
        token.type = INT;
        while (isdigit(ch)) {
            if (lexemeIndex >= MAX_LENGTH - 1) {
                printf("Error: Token length exceeds the maximum limit.\n");
                exit(1);
            }
            token.lexeme[lexemeIndex++] = ch;
            ch = fgetc(fp);
        }
        if (ch == '.') {
            token.type = FLOAT;
            token.lexeme[lexemeIndex++] = ch;
            ch = fgetc(fp);
            while (isdigit(ch)) {
                if (lexemeIndex >= MAX_LENGTH - 1) {
                    printf("Error: Token length exceeds the maximum limit.\n");
                    exit(1);
                }
                token.lexeme[lexemeIndex++] = ch;
                ch = fgetc(fp);
            }
        }
        token.lexeme[lexemeIndex] = '\0';
        ungetc(ch, fp);
        return token;
    }

    // 处理标识符和关键字
    if (isalpha(ch)) {
        while (isalnum(ch)) {
            if (lexemeIndex >= MAX_LENGTH - 1) {
                printf("Error: Token length exceeds the maximum limit.\n");
                exit(1);
            }
            token.lexeme[lexemeIndex++] = ch;
            ch = fgetc(fp);
        }
        token.lexeme[lexemeIndex] = '\0';
        ungetc(ch, fp);
        // 检查是否是关键字
        if (strcmp(token.lexeme, "if") == 0 || strcmp(token.lexeme, "else") == 0 || strcmp(token.lexeme, "while") == 0) {
            token.type = KEYWORD;
        } else {
            token.type = IDENTIFIER;
        }
        return token;
    }

    // 处理操作符和分号
    switch (ch) {
        case '+': token.type = ADD_OP; break;
        case '-': token.type = SUB_OP; break;
        case '*': token.type = MUL_OP; break;
        case '/': token.type = DIV_OP; break;
        case '=': token.type = ASSIGN_OP; break;
        case ';': token.type = SEMICOLON; break;
    }
    token.lexeme[0] = ch;
    token.lexeme[1] = '\0';
    return token;
}

int main() {
    FILE* fp = fopen("input.txt", "r");
    Token token;
    if (fp == NULL) {
        printf("Error: Failed to open the input file.\n");
        return 1;
    }
    while ((token = getNextToken(fp)).type != INVALID) {
        printf("Type: %d, Lexeme: %s\n", token.type, token.lexeme);
    }
    fclose(fp);
    return 0;
}
```

以上的代码实现了一个简单的词法分析器,能够识别整数、浮点数、标识符、运算符和分号。
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值