词法分析器

作者:朱鑫

邮箱:zhuxin@nwsuaf.edu.cn

/*C语言词法分析器,从三个方面处理:关键字,运算符,界符。并将分析结果、符号表、常数表分别输出到文件,可以忽略行内 // 之后的注释
//-1 error,代表错误

//保留字
1auto       2break      3case       4char       5const          6continue       7default        8do
9double     10else      11enum      12extern    13float         14for           15goto          16if
17int       18long      19register  20return    21short         22signed        23static        24sizeof
25struct    26switch    27typedef   28union     29unsigned      30void          31volatile      32while

//运算符
33= 34==
35+ 36++ 37+=
38- 39-- 40-= 41->
42* 43*=
44/ 45/=
46% 47%=
48(
49)
50[
51]
52,
53> 54>> 55>>= 56>=
57< 58<< 59<<= 60<=
61! 62!=
63^ 64^=
65& 66&& 67&=
68| 69|| 70|=
71~
72? :
73.

//界符
74{
75}
76;
77\

//注释
78 //

//79标示符

//80常数

*/
#include<iostream>
#include<fstream>
#include<cstdlib>
using namespace std;
// Keyword table. A keyword's token code is its index in this array (1..32);
// index 0 holds an empty placeholder because Reserve() returns 0 for "not a keyword".
// NOTE: the entries are not in strict lexicographic order ("static" is stored
// before "sizeof"), so lookups must not rely on the table being sorted.
string Keyword[]={"","auto","break","case","char","const","continue","default","do",
                     "double","else","enum","extern","float","for","goto","if",
                     "int","long","register","return","short","signed","static","sizeof",
                     "struct","switch","typedef","union","unsigned","void","volatile","while"
                 };
string ID[1000];                        // symbol table: distinct identifiers, in order of first appearance
int indexOfID=0;                        // number of entries currently stored in ID
string Const[1000];                     // constant table: distinct numeric literals, in order of first appearance
int indexOfConst=0;                     // number of entries currently stored in Const
bool IsLetter(char ch);                 // true if ch is an ASCII letter or an underscore
bool IsDigit(char ch);                  // true if ch is a decimal digit
int Reserve(string strToken);           // keyword lookup: returns the keyword's code, or 0 if strToken is not a keyword
void insertID(string strToken);         // adds strToken to the symbol table unless already present
void insertConst(string strToken);      // adds strToken to the constant table unless already present
/*
 * Lexer driver.
 *
 * Reads "in.txt" line by line, splits every line into tokens and writes one
 * record per token -- line number, token code, lexeme -- to stdout and to
 * "out.txt". Distinct identifiers and numeric constants are collected through
 * insertID()/insertConst() and dumped to "IDOut.txt" / "ConstOut.txt" once the
 * whole input has been consumed.
 *
 * Token codes: 1-32 keywords (the value returned by Reserve), 33-73 operators,
 * 74-77 delimiters, 78 line comment, 79 identifier, 80 integer constant,
 * -1 unrecognised character.
 *
 * Returns 0 on success; calls exit(1) when "in.txt" cannot be opened.
 */
int main(void)
{
    int code=0,lineAt=0;                // token code and current (1-based) line number
    string line;                        // the line being scanned
    int i;                              // scan position inside line
    int length=0;                       // length of line
    string strToken="";                 // lexeme of the token being assembled
    char ch;                            // first character of the current token
    ifstream in;                        // source text
    ofstream out;                       // token stream
    ofstream IDOut;                     // symbol table dump
    ofstream ConstOut;                  // constant table dump
    in.open("in.txt",ios::in);
    if(!in)
    {
        cerr<<"File open or create error!"<<endl;
        exit(1);
    }
    out.open("out.txt",ios::out);
    IDOut.open("IDOut.txt",ios::out);
    ConstOut.open("ConstOut.txt",ios::out);
    while(getline(in,line))
    {
        lineAt++;
        if((length=line.length())==0)
        {
            continue;
        }
        i=0;
        // line[length] yields '\0' (guaranteed for std::string since C++11) and
        // serves as the end-of-line sentinel for every look-ahead below.
        while(i<=length)
        {
            // skip blanks between tokens
            while(line[i]==' '||line[i]=='\t'||line[i]=='\n')
            {
                i++;
            }
            ch=line[i];
            if(ch=='\0')
            {
                break;
            }
            strToken+=ch;
            // Convention for all branches: on exit, i indexes the LAST character
            // of the token; the shared i++ after the chain steps past it.

            // identifier or keyword: letter (letter | digit)*
            if(IsLetter(ch))
            {
                while( IsLetter(line[++i]) || IsDigit(line[i]) )
                {
                    strToken+=line[i];
                }
                i--;
                code=Reserve(strToken);
                if(code==0)
                {
                    code=79;
                    insertID(strToken);
                }
            }
            // integer constant: digit+
            else if(IsDigit(ch))
            {
                while(IsDigit(line[++i]))
                {
                    strToken+=line[i];
                }
                i--;
                code=80;
                insertConst(strToken);
            }
            // =  ==
            else if(ch=='=')
            {
                if(line[++i]=='=')
                {
                    code=34;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=33;
                }
            }
            // +  ++  +=
            else if(ch=='+')
            {
                if(line[++i]=='+')
                {
                    code=36;
                    strToken+='+';
                }
                else if(line[i]=='=')
                {
                    code=37;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=35;
                }
            }
            // -  --  -=  ->
            else if(ch=='-')
            {
                if(line[++i]=='-')
                {
                    code=39;
                    strToken+='-';
                }
                else if(line[i]=='=')
                {
                    code=40;
                    strToken+='=';
                }
                else if(line[i]=='>')
                {
                    code=41;
                    strToken+='>';
                }
                else
                {
                    i--;
                    code=38;
                }
            }
            // *  *=
            else if(ch=='*')
            {
                if(line[++i]=='=')
                {
                    code=43;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=42;
                }
            }
            // /  /=  and the line comment "//"
            else if(ch=='/')
            {
                if(line[++i]=='=')
                {
                    code=45;
                    strToken+='=';
                }
                else if(line[i]=='/')
                {
                    // report the comment marker, then drop the rest of the line
                    code=78;
                    strToken+='/';
                    cout<<"(    "<<lineAt<<","<<"\""<<code<<"\""<<","<<"\""<<strToken<<"\""<<"  )"<<endl;
                    out<<"( "<<lineAt<<","<<"\""<<code<<"\""<<","<<"\""<<strToken<<"\""<<"  )"<<endl;
                    strToken="";
                    break;
                }
                else
                {
                    i--;
                    code=44;
                }
            }
            // %  %=
            else if(ch=='%')
            {
                if(line[++i]=='=')
                {
                    code=47;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=46;
                }
            }
            // brackets and comma
            else if(ch=='(')
            {
                code=48;
            }
            else if(ch==')')
            {
                code=49;
            }
            else if(ch=='[')
            {
                code=50;
            }
            else if(ch==']')
            {
                code=51;
            }
            else if(ch==',')
            {
                code=52;
            }
            // >  >>  >>=  >=
            // (No extra i++ here: the shared increment below already steps past
            // the operator; advancing twice would swallow the next character.)
            else if(ch=='>')
            {
                if(line[++i]=='>')
                {
                    strToken+='>';
                    if(line[++i]=='=')
                    {
                        code=55;
                        strToken+='=';
                    }
                    else
                    {
                        i--;
                        code=54;
                    }
                }
                else if(line[i]=='=')
                {
                    code=56;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=53;
                }
            }
            // <  <<  <<=  <=
            else if(ch=='<')
            {
                if(line[++i]=='<')
                {
                    strToken+='<';
                    if(line[++i]=='=')
                    {
                        code=59;
                        strToken+='=';
                    }
                    else
                    {
                        i--;
                        code=58;
                    }
                }
                else if(line[i]=='=')
                {
                    code=60;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=57;
                }
            }
            // !  !=
            else if(ch=='!')
            {
                if(line[++i]=='=')
                {
                    code=62;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=61;
                }
            }
            // ^  ^=
            else if(ch=='^')
            {
                if(line[++i]=='=')
                {
                    code=64;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=63;
                }
            }
            // &  &&  &=
            else if(ch=='&')
            {
                if(line[++i]=='&')
                {
                    code=66;
                    strToken+='&';
                }
                else if(line[i]=='=')
                {
                    code=67;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=65;
                }
            }
            // |  ||  |=
            else if(ch=='|')
            {
                if(line[++i]=='|')
                {
                    code=69;
                    strToken+='|';
                }
                else if(line[i]=='=')
                {
                    code=70;
                    strToken+='=';
                }
                else
                {
                    i--;
                    code=68;
                }
            }
            // remaining single-character operators and delimiters
            else if(ch=='~')
            {
                code=71;
            }
            else if(ch=='?')
            {
                code=72;
            }
            else if(ch=='.')
            {
                code=73;
            }
            else if(ch=='{')
            {
                code=74;
            }
            else if(ch=='}')
            {
                code=75;
            }
            else if(ch==';')
            {
                code=76;
            }
            else if(ch=='\\')
            {
                code=77;
            }
            else
            {
                // anything else is reported as an error token
                // NOTE(review): ':' lands here although the header lists "72? :" -- confirm intent
                code=-1;
            }
            i++;
            // multi-character lexemes are printed from strToken, single characters from ch
            if(strToken.length()>1)
            {
                cout<<"(    "<<lineAt<<","<<"\""<<code<<"\""<<","<<"\""<<strToken<<"\""<<"  )"<<endl;
                out<<"( "<<lineAt<<","<<"\""<<code<<"\""<<","<<"\""<<strToken<<"\""<<"  )"<<endl;
            }
            else
            {
                cout<<"(    "<<lineAt<<","<<"\""<<code<<"\""<<","<<"\""<<ch<<"\""<<"    )"<<endl;
                out<<"( "<<lineAt<<","<<"\""<<code<<"\""<<","<<"\""<<ch<<"\""<<" )"<<endl;
            }
            strToken="";
        }
    }
    // dump both tables as "<index>\t<text>" lines
    for(int i=0;i<indexOfID;i++)
    {
        IDOut<<i<<"\t"<<ID[i]<<endl;
    }
    for(int i=0;i<indexOfConst;i++)
    {
        ConstOut<<i<<"\t"<<Const[i]<<endl;
    }
    in.close();
    out.close();
    IDOut.close();
    ConstOut.close();
    return 0;
}
// Tells whether ch may appear in an identifier as a letter:
// an ASCII lower-case letter, an ASCII upper-case letter, or '_'.
bool IsLetter(char ch)
{
    const bool lowerCase = ('a'<=ch) && (ch<='z');
    const bool upperCase = ('A'<=ch) && (ch<='Z');
    const bool underscore = (ch=='_');
    return lowerCase || upperCase || underscore;
}
// Tells whether ch is one of the decimal digits '0'..'9'.
bool IsDigit(char ch)
{
    const bool belowRange = ch<'0';
    const bool aboveRange = ch>'9';
    return !(belowRange || aboveRange);
}
/*
 * Looks strToken up in the Keyword table.
 *
 * Returns the keyword's token code (its index in Keyword, 1..32), or 0 when
 * strToken is not a keyword.
 *
 * The table is scanned linearly on purpose: its entries are not in
 * lexicographic order ("static" is stored before "sizeof"), so a binary
 * search over it cannot reach every entry -- "static" would be reported as an
 * ordinary identifier. Index 0 is a placeholder and is never matched.
 */
int Reserve(string strToken)
{
    const int tableSize=sizeof(Keyword)/sizeof(Keyword[0]);
    for(int code=1;code<tableSize;code++)
    {
        if(Keyword[code]==strToken)
        {
            return code;
        }
    }
    return 0;
}
/*
 * Records strToken in the symbol table ID unless an equal entry is already
 * stored, so every identifier appears once, in order of first appearance.
 *
 * When the fixed-size table is full the identifier is dropped and a message
 * is written to cerr instead of writing past the end of the array.
 */
void insertID(string strToken)
{
    const int capacity=sizeof(ID)/sizeof(ID[0]);
    for(int i=0;i<indexOfID;i++)
    {
        if(strToken==ID[i])
        {
            return ;                    // already known
        }
    }
    if(indexOfID>=capacity)
    {
        cerr<<"Symbol table is full, identifier dropped: "<<strToken<<endl;
        return ;
    }
    ID[indexOfID++]=strToken;
}
/*
 * Records strToken in the constant table Const unless an equal entry is
 * already stored, so every literal appears once, in order of first appearance.
 * Literals are compared as text, not by numeric value.
 *
 * When the fixed-size table is full the constant is dropped and a message is
 * written to cerr instead of writing past the end of the array.
 */
void insertConst(string strToken)
{
    const int capacity=sizeof(Const)/sizeof(Const[0]);
    for(int i=0;i<indexOfConst;i++)
    {
        if(strToken==Const[i])
        {
            return ;                    // already known
        }
    }
    if(indexOfConst>=capacity)
    {
        cerr<<"Constant table is full, constant dropped: "<<strToken<<endl;
        return ;
    }
    Const[indexOfConst++]=strToken;
}

 
实验1 词法分析 一、 实验目的 调试并完成一个词法分析程序,加深对词法分析原理的理解。 二、 实验要求 1、 待分析的简单语言词法 (1) 关键字: begin if then while do end 所有关键字都是小写。 (2) 运算符和界符: := + – * / < <= <> > >= = ; ( ) # (3) 其他单词是标识符(ID)和整型常数(NUM),通过以下正规式定义: ID=letter(letter| digit)* NUM=digit digit * (4)空格由空白、制表符和换行符组成。空格一般用来分隔ID、NUM,运算符、界符和关键字,词法分析阶段通常被忽略。 2、 各种单词符号对应的种别码 单词符号 种别码 单词符号 种别码 begin 1 : 17 if 2 := 18 then 3 > 20 while 4 <> 21 do 5 <= 22 end 6 < 23 letter(letter| digit)* 10 >= 24 digit digit * 11 = 25 * 13 ; 26 / 14 ( 27 + 15 ) 28 - 16 # 0 3、 词法分析程序的功能 输入:所给文法的源程序字符串。 输出:二元组(syn,token或sum)构成的序列。 其中:syn为单词种别码; token为存放的单词自身字符串; sum为整型常数。 三、结果验证 给定源程序 begin x:=9; if x>0 then x:=2*x+1/3; end# 输出结果。 四、源程序代码如下: #include<stdio.h> #include<string.h> #include<iostream.h> char prog[80],token[8]; char ch; int syn,p,m=0,n,sum=0; //p是缓冲区prog的指针,m是token的指针 char *rwtab[6]={"begin","if","then","while","do","end"}; void scaner() { for(n=0;n<8;n++) token[n]=NULL; ch=prog[p++]; while(ch=='_') 执行语句1; if((判断ch是字母字符的条件)) { m=0; while((ch>='A'&&ch<='Z')||(ch>='a'&&ch<='z')||(ch>='0'&&ch<='9')) { token[m++]=ch; ch=prog[p++];} token[m++]='\0'; p--; syn=10; for(n=0;n<6;n++) if(加入判断条件) {syn=n+1; break; } } else if(判断ch是数字字符的条件)' { sum=0; while(ch>='0'&&ch<='9') { sum=sum*10+ch-'0'; ch=prog[p++]; } p--;执行语句2; } else ………完成剩余程序代码 }
设计思想 (1)程序主体结构部分: 说明部分 %% 规则部分 %% 辅助程序部分 (2)主体结构的说明 在这里说明部分告诉我们使用的LETTER,DIGIT, IDENT(标识符,通常定义为字母开头的字母数字串)和STR(字符串常量,通常定义为双引号括起来的一串字符)是什么意思.这部分也可以包含一些初始化代码.例如用#include来使用标准的头文件和前向说明(forward references).这些代码应该放在标记"%{"和"%}"之间;规则部分可以包括任何你想用来分析的代码;我们这里包括了忽略所有注释中字符的功能,传送ID名称和字符串常量内容到主调函数和main函数的功能. (3)实现原理 程序中先判断这个语句中每个单元为关键字、常数、运算符、界符,对于不同的单词符号给出不同编码形式的编码,用以区分之。 PL/0语言的EBNF表示 <常量定义>::=<标识符>=<无符号整数>; <标识符>::=<字母>{<字母>|<数字>}; <加法运算符>::=+|- <乘法运算符>::=*|/ <关系运算符>::==|#|<|<=|>|>= <字母>::=a|b|…|X|Y|Z <数字>::=0|1|2|…|8|9 三:设计过程 1. 关键字:void,main,if,then,break,int,char,float,include,for,while,printf,scanf,均为小写。 2. "+";"-";"*";"/";":=";":";"<";"<=";">";">=";"<>";"=";"(";")";";";"#" 为运算符。 3. 其他标记 如字符串,表示以字母开头的标识符。 4. 空格符跳过。 5. 各符号对应种别码 关键字分别对应1-13 运算符分别对应401-418,501-513。 字符串对应100 常量对应200 结束符# 四:举例说明 目标:实现对常量的判别 代码: digit [0-9] letter [A-Za-z] other_char [!-@\[-~] id ({letter}|[_])({letter}|{digit}|[_])* string {({letter}|{digit}|{other_char})+} int_num {digit}+ %% [ |\t|\n]+ "auto"|"double"|"int"|"struct"|"break"|"else"|"long"|"switch"|"case"|"enum"|"register"|"typedef"|"char"|"extern"|"return"|"union"|"const"|"float"|"short"|"unsigned"|"continue"|"for"|"signed"|"void"|"default"|"goto"|"sizeof"|"do"|"if"|"static"|"while"|"main" {Upper(yytext,yyleng);printf("%s,NULL\n",yytext);} \"([!-~])*\" {printf("CONST_string,%s\n",yytext);} -?{int_num}[.]{int_num}?([E][+|-]?{int_num})? 
{printf("CONST_real,%s\n",yytext);} "0x"?{int_num} {printf("CONST_int,%s\n",yytext);} ","|";"|"("|")"|"{"|"}"|"["|"]"|"->"|"."|"!"|"~"|"++"|"--"|"*"|"&"|"sizeof"|"/"|"%"|"+"|"-"|">"|"<"|">="|"<="|"=="|"!="|"&"|"^"|"|"|"&"|"||"|"+="|"-="|"*="|"/="|"%="|">>="|"<<="|"&="|"^="|"|="|"=" {printf("%s,NULL\n",yytext);} {id} {printf("ID,%s\n",yytext);} {digit}({letter})+ {printf("error1:%s\n",yytext);} %% #include <ctype.h> Upper(char *s,int l) { int i; for(i=0;i<l;i++) { s[i]=toupper(s[i]); } } yywrap() { return 1; } 五:DFA 六:数据测试 七:心得体会 其实匹配并不困难,主要是C++知识要求相对较高,只要把握住指针就好了。 附源程序: #include<iostream.h> #include<stdio.h> #include<stdlib.h> #include<string.h> int i,j,k,flag,number,status; /*status which is use to judge the string is keywords or not!*/ char ch; char words[10] = {" "}; char program[500]; int Scan(char program[]) { char *keywords[13] = {"void","main","if","then","break","int", "char","float","include","for","while","printf", "scanf"}; number = 0; status = 0; j = 0; ch = program[i++]; /* To handle the lettle space ands tab*/ /*handle letters*/ if ((ch >= 'a') && (ch <= 'z' )) { while ((ch >= 'a') && (ch <= 'z' )) { words[j++]=ch; ch=program[i++]; } i--; words[j++] = '\0'; for (k = 0; k < 13; k++) if (strcmp (words,keywords[k]) == 0) switch(k) { case 0:{ flag = 1; status = 1; break; } case 1:{ flag = 2; status = 1; break; } case 2:{ flag = 3; status = 1; break; } case 3:{ flag = 4; status = 1; break; } case 4:{ flag = 5; status = 1; break; } case 5:{ flag = 6; status = 1; break; } case 6:{ flag = 7; status = 1; break; } case 7:{ flag = 8; status = 1; break; } case 8:{ flag = 9; status = 1; break; } case 9:{ flag = 10; status = 1; break; } case 10:{ flag = 11; status = 1; break; } case 11:{ flag = 12; status = 1; break; } case 12:{ flag = 13; status = 1; break; } } if (status == 0) { flag = 100; } } /*handle digits*/ else if ((ch >= '0') && (ch <= '9')) { number = 0; while ((ch >= '0' ) && (ch <= '9' )) { number = number*10+(ch-'0'); ch = program[i++]; } flag = 200; 
i--; } /*opereation and edge handle*/ else switch (ch) { case '=':{ if (ch == '=') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 401; } else { i--; flag = 402; } break; } case'>':{ if (ch == '>') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 403; } else { i--; flag = 404; } break; } case'<':{ if (ch == '<') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 405; } else { i--; flag = 406; } break; } case'!':{ if (ch == '!') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 407; } else { i--; flag = 408; } break; } case'+':{ if (ch == '+') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 409; } else if (ch == '+') { words[j++] = ch; words[j] = '\0'; flag = 410; } else { i--; flag = 411; } break; } case'-':{ if (ch == '-') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 412; } else if( ch == '-') { words[j++] = ch; words[j] = '\0'; flag = 413; } else { i--; flag = 414; } break; } case'*':{ if (ch == '*') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 415; } else { i--; flag = 416; } break; } case'/':{ if (ch == '/') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 417; } else { i--; flag = 418; } break; } case';':{ words[j] = ch; words[j+1] = '\0'; flag = 501; break; } case'(':{ words[j] = ch; words[j+1] = '\0'; flag = 502; break; } case')':{ words[j] = ch; words[j+1] = '\0'; flag = 503; break; } case'[':{ words[j] = ch; words[j+1] = '\0'; flag = 504; break; } case']':{ words[j] = ch; words[j+1] = '\0'; flag = 505; break; } case'{':{ words[j] = ch; words[j+1] = '\0'; flag = 
506; break; } case'}':{ words[j] = ch; words[j+1] = '\0'; flag = 507; break; } case':':{ words[j] = ch; words[j+1] = '\0'; flag = 508; break; } case'"':{ words[j] = ch; words[j+1] = '\0'; flag = 509; break; } case'%':{ if (ch == '%') words[j++] = ch; words[j] = '\0'; ch = program[i++]; if (ch == '=') { words[j++] = ch; words[j] = '\0'; flag = 510; } else { i--; flag = 511; } break; } case',':{ words[j] = ch; words[j+1] = '\0'; flag = 512; break; } case'#':{ words[j] = ch; words[j+1] = '\0'; flag = 513; break; } case'@':{ words[j] = '#'; flag = 0; break; } default:{ flag = -1; break; } } return flag; } main() { i=0; printf("please input a program end with @"); do { ch = getchar(); program[i++] = ch; }while(ch != '@'); i = 0; do{ flag = Scan(program); if (flag == 200) { printf("(%2d,%4d)",flag,number); } else if (flag == -1) { printf("(%d,error)",flag); } else { printf("(%2d,%4s)",flag,words); } }while (flag != 0); system("pause"); }
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值