词法分析器
(一)基本内容
实现一个可以识别C语言(子集)词法的程序,识别源代码中的每个词,并且将结果保存下来。
可识别的词包括
头文件:以#include开头的过滤为头文件
保留字:“auto”, “break”, “case”, “char”, “const”, “continue”,
“default”, “do”, “double”, “else”, “enum”, “extern”,“float”,
“for”, “goto”, “if”, “int”, “long”,“register”,“return”,
“short”, “signed”, “sizeof”, “static”,“struct”,“switch”,
“typedef”, “union”, “unsigned”, “void”,“volatile”,“while”
界符: “+”, “-”, “*”, “/”, “<”, “<=”, “>”, “>=”, “=”, “==”,
“!=”, “;”, “(”, “)”, “^”, “,”, “"”, “'”, “#”, “&”,
“&&”, “|”, “||”, “%”, “~”, “<<”, “>>”, “[”, “]”, “{”,
“}”, “\”, “.”, “?”, “:”, “!”
标识符:未出现在保留字中且符合标识符格式的记为标识符
(二)算法思路
将txt格式的源代码读入数组,
读入后先经过过滤函数处理:过滤掉头文件(严格来说应当替换展开头文件,这里为了简化直接将其过滤掉),同时在屏幕上打印出该头文件;过滤掉换行符和制表符;过滤掉单行注释和多行注释。然后将过滤后的程序覆盖到之前的数组中,并在最后加入结束符号“$”,
进行扫描,在扫描过程中过滤空格,并通过事先建立好的DFA,来判定将要加入临时数组中的词,到不符合DFA的字符时则停止,此时返回和该临时数组相符的种别码,如:标识符种别码为100,保留字种别码为1-32等等,
通过种别码判断:如果是标识符,则在事先建立好的标识符数组中查找是否已经添加过该标识符,如果未添加过,则将其加入到上一个标识符之后;同时将结果打印到屏幕上,并且输出到新的目标文件。
(三)最终结果
源代码文件:
屏幕打印结果:
输出文件:
(四)备注
由于该编译器只有词法分析器部分,输出的二元组并未完全按照词法分析器的规则给出。为了便于查看输出状况、方便调试,这里将词的类别(如“标识符”)作为二元组的第一个值,将词本身作为第二个值输出,如上图所示。
(五)程序代码
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
int t = -1;// Category code of the most recently scanned token; -1 until the first scan
int pProject = 0;// Read cursor (array index) into the source-program buffer
// Reserved words. searchReserve() reports a match at index i as category code i + 1,
// so the order of the entries defines codes 1..32.
static char reserveWord[32][10] =
{
"auto", "break", "case", "char", "const", "continue",
"default", "do", "double", "else", "enum", "extern","float",
"for", "goto", "if", "int", "long","register","return",
"short", "signed", "sizeof", "static","struct","switch",
"typedef", "union", "unsigned", "void","volatile","while"
};
// Delimiters and operators. Scaning() maps the entry at index i to category code 33 + i,
// so the order of the entries defines codes 33..68 and must not be changed.
static char boredrWord[36][10] =
{
"+", "-", "*", "/", "<", "<=", ">", ">=", "=", "==",
"!=", ";", "(", ")", "^", ",", "\"", "\'", "#", "&",
"&&", "|", "||", "%", "~", "<<", ">>", "[", "]", "{",
"}", "\\", ".", "\?", ":", "!"
};
// Identifier table: room for 1000 entries of up to 49 characters each.
// An empty string marks a free slot; main() appends each new identifier to the first free slot.
static char identifier[1000][50] = {""};
// Pre-filter the raw source text in place before scanning.
//
// - Each "#include" directive is removed from the text; the part of its line starting at
//   the first '<' or '"' is printed to stdout as "( 头文件 , ... )".
// - "//" comments are removed up to (not including) the end of the line.
// - "/* ... */" comments are replaced by a single space; an unterminated block comment
//   swallows the rest of the input.
// - Newlines, tabs and carriage returns are turned into spaces so that the words on either
//   side of them stay separate tokens (Scaning() skips spaces).
// - The end marker '$' followed by a terminating NUL is appended.
//
// Parameters:
//   resourceProject  buffer holding the source text; must have room for at least end + 2 chars
//   end              number of valid characters in resourceProject
// Returns the index of the '$' end marker, i.e. the length of the filtered text.
//
// NOTE(review): comment markers inside string/character literals are not recognised as
// literal text and are still treated as comments.
int filtrationResource( char resourceProject[] , int end )
{
    int count = 0;// write index; never overtakes the read index, so filtering in place is safe
    int i = 0;// read index

    while ( i < end )
    {
        char ch = resourceProject[i];

        // Header file: "#include" must fit entirely inside the valid range.
        if ( ch == '#' && end - i >= 8 && memcmp( &resourceProject[i], "#include", 8 ) == 0 )
        {
            i += 8;
            // Skip to the opening '<' or '"', but never leave the current line.
            while ( i < end && resourceProject[i] != '\n'
                    && resourceProject[i] != '<' && resourceProject[i] != '"' )
            {
                i++;
            }
            printf("( 头文件 , ");
            while ( i < end && resourceProject[i] != '\n' )
            {
                printf("%c",resourceProject[i++]);
            }
            printf(" )\n");
            continue;// the newline, if any, is handled by the next iteration
        }

        // Single-line comment: drop everything up to the end of the line.
        if ( ch == '/' && i + 1 < end && resourceProject[i + 1] == '/' )
        {
            while ( i < end && resourceProject[i] != '\n' )
            {
                i++;
            }
            continue;
        }

        // Block comment: ends only at a '*' immediately followed by '/'.
        if ( ch == '/' && i + 1 < end && resourceProject[i + 1] == '*' )
        {
            i += 2;
            while ( i + 1 < end
                    && !( resourceProject[i] == '*' && resourceProject[i + 1] == '/' ) )
            {
                i++;
            }
            i = ( i + 1 < end ) ? i + 2 : end;// unterminated comment: discard the rest
            resourceProject[count++] = ' ';// a comment separates tokens like a space does
            continue;
        }

        // Line breaks and tabs become plain spaces; everything else is kept.
        if ( ch == '\n' || ch == '\t' || ch == '\r' )
        {
            ch = ' ';
        }
        resourceProject[count++] = ch;
        i++;
    }

    resourceProject[count] = '$';// end marker recognised by the scanner
    resourceProject[count + 1] = '\0';
    return count;// new end index of the source buffer
}
// Returns 1 when the character is a decimal digit '0'..'9', otherwise 0.
int IsDigit(char digit)
{
    return ( '0' <= digit && digit <= '9' ) ? 1 : 0;
}
// Returns 1 when the character may appear in an identifier as a "letter":
// an ASCII lower- or upper-case letter, or the underscore (which C allows
// anywhere in an identifier, including the first position). Otherwise returns 0.
int IsLetter(char letter)
{
    if ( letter == '_' )
    {
        return 1;
    }
    if ( 'a' <= letter && letter <= 'z' )
    {
        return 1;
    }
    if ( 'A' <= letter && letter <= 'Z' )
    {
        return 1;
    }
    return 0;
}
// Look up s in a table of 32 reserved words.
// Returns the 1-based position of the matching entry (its category code),
// or -1 when s is not a reserved word.
int searchReserve(char reserveWord[][10], char s[])
{
    int index = 0;

    while ( index < 32 )
    {
        if ( strcmp( reserveWord[index], s ) == 0 )
        {
            return index + 1;// category code = table position + 1
        }
        index++;
    }
    return -1;
}
// Category codes produced by the scanner and sizes it relies on.
enum
{
    CODE_END        = 0,// end marker '$'
    CODE_DELIM_BASE = 33,// delimiter/operator code = CODE_DELIM_BASE + index in boredrWord
    CODE_CONSTANT   = 99,// unsigned integer constant
    CODE_IDENTIFIER = 100,// identifier that is not a reserved word
    DELIMITER_COUNT = 36,// number of entries in boredrWord
    TOKEN_CAPACITY  = 20// size of the caller's token buffer, including the NUL
};

// Append ch to token unless the buffer is full; over-long lexemes are truncated
// to TOKEN_CAPACITY - 1 characters instead of overflowing the buffer.
static void appendToToken( char token[], int *count, char ch )
{
    if ( *count < TOKEN_CAPACITY - 1 )
    {
        token[*count] = ch;
        ( *count )++;
    }
}

// Returns 1 when first followed by second forms one of the two-character
// operators <=, <<, >=, >>, ==, !=, &&, ||; otherwise 0.
static int pairsWith( char first, char second )
{
    switch ( first )
    {
    case '<':
        return second == '=' || second == '<';
    case '>':
        return second == '=' || second == '>';
    case '=':
    case '!':
        return second == '=';
    case '&':
        return second == '&';
    case '|':
        return second == '|';
    default:
        return 0;
    }
}

// Scan the next token of resourceProject starting at the global cursor pProject.
//
// Leading spaces are skipped. The lexeme is stored in token (which must hold at
// least TOKEN_CAPACITY chars), the cursor is advanced past it, and the category
// code is stored in the global t and returned:
//   1..32   reserved word (position in reserveWord + 1)
//   33..68  delimiter/operator (33 + position in boredrWord)
//   99      integer constant
//   100     identifier
//   0       end marker '$' (the cursor is left on the marker, token is empty)
// Any other character is a lexical error: a message is printed and the
// program exits with a failure status.
int Scaning( char resourceProject[] , char token[] )
{
    int i, count = 0;// count indexes the next free position in token
    char ch;

    // Skip spaces so they are never mistaken for an unknown character.
    while ( resourceProject[pProject] == ' ' )
    {
        pProject++;
    }
    ch = resourceProject[pProject];

    // Start every scan from an empty token.
    for ( i = 0; i < TOKEN_CAPACITY; i++ )
    {
        token[i] = '\0';
    }

    if ( IsLetter( ch ) )
    {// identifier or reserved word: letter followed by letters/digits
        while ( IsLetter( resourceProject[pProject] ) || IsDigit( resourceProject[pProject] ) )
        {
            appendToToken( token, &count, resourceProject[pProject] );
            pProject++;
        }// the cursor now rests on the first character of the next token
        token[count] = '\0';
        t = searchReserve( reserveWord, token );
        if ( t == -1 )
        {
            t = CODE_IDENTIFIER;
        }
        return t;
    }

    if ( IsDigit( ch ) )
    {// integer constant: a run of digits
        while ( IsDigit( resourceProject[pProject] ) )
        {
            appendToToken( token, &count, resourceProject[pProject] );
            pProject++;
        }
        token[count] = '\0';
        t = CODE_CONSTANT;// previously left unset, so constants carried a stale code
        return t;
    }

    // Delimiter or operator: build the one- or two-character lexeme, then
    // derive the code from its position in boredrWord.
    token[count++] = ch;
    if ( ch != '$' && ch != '\0' && pairsWith( ch, resourceProject[pProject + 1] ) )
    {
        token[count++] = resourceProject[pProject + 1];
    }
    for ( i = 0; i < DELIMITER_COUNT; i++ )
    {
        if ( strcmp( token, boredrWord[i] ) == 0 )
        {
            t = CODE_DELIM_BASE + i;
            pProject += count;// consume the whole lexeme, both characters of e.g. "<<"
            return t;
        }
    }

    token[0] = '\0';
    if ( ch == '$' )
    {// end marker
        t = CODE_END;
        return t;
    }

    // Not recognised by any rule above: lexical error.
    printf("error:there is no exist %c00 \n", ch);
    exit(EXIT_FAILURE);
}
// Entry point: read the source program from E:\resourceProject.txt, filter it,
// scan it token by token, and write one "( category , lexeme )" pair per token
// both to stdout and to E:\targetProject.txt.
// Returns 0 on success and EXIT_FAILURE when a file cannot be opened, the
// source does not fit in the buffer, or the output cannot be written completely.
int main()
{
    char resourceProject[10000];
    const int capacity = (int)sizeof resourceProject;
    char token[20] = {0};
    int i;
    int end = 0;// index of the end marker after filtering
    int c;// fgetc() result; must stay an int so that EOF is distinguishable from data
    FILE *fp;

    if ( ( fp = fopen ( "E:\\resourceProject.txt", "r" ) ) == NULL )
    {
        printf("未找到源程序文档!");
        return EXIT_FAILURE;
    }

    // Read the whole source program, keeping two spare bytes for the end marker
    // and the terminating NUL that the filter appends.
    pProject = 0;
    while ( ( c = fgetc( fp ) ) != EOF )
    {
        if ( pProject >= capacity - 2 )
        {
            fprintf( stderr, "error: source file is larger than %d bytes\n", capacity - 2 );
            fclose( fp );
            return EXIT_FAILURE;
        }
        resourceProject[pProject++] = (char)c;
    }
    resourceProject[pProject] = '\0';
    fclose( fp );

    end = filtrationResource( resourceProject , pProject );
    (void)end;// the scanner stops at the end marker, so the index itself is not needed
    pProject = 0;

    if ( ( fp = fopen ( "E:\\targetProject.txt", "w" ) ) == NULL )
    {
        printf("未找到输出文档!");
        return EXIT_FAILURE;
    }

    while ( t != 0 )
    {
        // Scan the next token; code 0 means the end marker was reached.
        t = Scaning( resourceProject, token );
        if ( t == 100 )
        {
            // Record the identifier in the first free slot unless it is already known.
            // If the table is full the identifier is still reported, just not stored.
            for ( i = 0; i < 1000; i++ )
            {
                if ( strcmp( identifier[i], token ) == 0 )
                {// already in the table
                    break;
                }
                if ( strcmp( identifier[i], "" ) == 0 )
                {// free slot found
                    strcpy( identifier[i], token );
                    break;
                }
            }
            printf("( 标识符 , %s )\n", token);
            fprintf(fp,"( 标识符 , %s )\n", token);
        }
        else if ( t >= 1 && t <= 32 )
        {
            printf("( 保留字 , %s )\n", reserveWord[t - 1]);
            fprintf(fp,"( 保留字 , %s )\n", reserveWord[t - 1]);
        }
        else if ( t == 99 )
        {
            printf("( 常数 , %s )\n", token);
            fprintf(fp,"( 常数 , %s )\n", token);
        }
        else if ( t >= 33 && t <= 68 )
        {
            printf("( 界符 , %s )\n", boredrWord[t - 33]);
            fprintf(fp,"( 界符 , %s )\n", boredrWord[t - 33]);
        }
    }

    // Closing flushes the buffered output; a failure here means the file is incomplete.
    if ( fclose( fp ) != 0 )
    {
        fprintf( stderr, "error: failed to write the output file\n" );
        return EXIT_FAILURE;
    }
    return 0;
}