词法分析是编译的基础,需要对程序中的单词进行划分,并生成token文件(主要存符号表的入口地址,以便获取进一步需要的信息),供语法分析阶段使用。同时要生成符号表,包括变量的和常量的,在之后的分析中会不断的查填符号表,将单词的类型,值等各项信息填完整,才能进行运算等操作。采用了较为底层的C语言。结果由于好久没用过了,犯了许多低级的错误,编程过程比较艰辛,下面总结遇到的问题。
1、C语言严格区分字符和字符串,%c和%s不要混用,否则造成异常。Java中经常使用String或StringBuilder,淡化了字符的概念,在C中需要注意。
2、自定义的头文件要用双引号而不是尖括号。
3、'\n'光标移到下一行,在打印源程序时,头文件打印空行,看上去像多打印了一个'\n',其实没错。
4、字符串不能直接赋值,需要搞清楚char型数组和char*的不同。strcpy是将内容进行拷贝,char型数组要用strcpy来赋值;而char*可以直接指向字符串常量,因为字符串常量表示的是一个地址,让char型的指针直接指向该地址可以避免空间浪费。
5、在不同的处理函数中,要生成token,并赋值,然后让token的指针的数组指向该位置。但由于函数内的token变量是局部变量,跳出函数则失效,故指针指向的位置的内容将有可能改变,导致在该函数之外往文件中写token数组时乱码。
6、fgets读到行末尾时,会把换行符'\n'保留在缓冲区中,并在其后添加'\0';判断字符串结束时应判断'\0',而不是'\n'。
7、开辟的数组空间太小导致越界,也会导致乱码。
8、最费脑子的错误就是自己设计的headCh,作为每次识别的第一个字符。因为读文件的时候,文件指针会一直的向后移动,当时为了避免文件指针往回移动的时间开销,就用一个char类型的headCh来保存第一个字符。但是预处理的函数中,需要根据下一个字符判断是否结束预处理,故由headCh保存下一个字符,再读到buffer中时,buffer的第一个字符为下一行的第二个字符,即headCh+buffer的内容才是完整的一行的内容。所以为了不改变程序的处理过程,还要保持headCh始终在buffer[start]之前的一位,而处理程序是根据headCh的值来判断调用哪个处理函数的,故每次处理结束后需要给headCh和start重新赋值。
程序流程图:
Token:
错误处理:
源码:
main.c
#include "global.h"
#include <ctype.h>
int main()
{
char fname[FSIZE];
error = fopen("error.txt", "w+");
if(error==NULL)
{
printf("cannot create error.txt!\n");
}
out = fopen("out.txt", "w+");
if(out==NULL)
{
printf("cannot create out.txt!\n");
}
printf("please input filename: \n");
scanf("%s", fname);
in = fopen(fname, "r");
if(in==NULL)
{
printf("error: cannot open file %s\n", fname);
return -1;
}
headCh = predeal(in);
while(fgets(buf, BSIZE, in)!=NULL)
{
head = headCh;
int len = strlen(buf);
buf[len-1] = '\0';
// printf("buf:$%s$\tline:%d\n", buf, line);
start = 0;
while(headCh != '\0' )
{
while(buf[start]==' ' && headCh == ' ')
{
start++;
}
if(headCh == ' ')
{
if(buf[start] == '\0')break;
headCh = buf[start++];
}
// printf("start = %d\n", start);
// printf("headCH=%c\n",headCh);
if(isalpha(headCh))
{
// printf("ooooooo: DEAL ALPHA\n");
dealAlpha();
}
else if(isdigit(headCh))
{
// printf("ooooooo: DEAL DIGIT\n");
dealDigit();
}
else if(headCh=='/')
{
// printf("ooooooo: DEAL NOTATION\n");
if(dealNotation()==-1)
{
printf("notation too long to analyze, skip this line...\n");
fputc('\n', out);
break;
}
}
else if(isBorder(headCh))
{
// printf("ooooooo: DEAL BORDER\n");
dealBorder();
}
else if(headCh=='\'' || headCh =='"')
{
// printf("ooooooo: DEAL CHAR\n");
dealChar(headCh);
}
else // not available start
{
fprintf(error, "L%d\tcannot analyse %c\n",line,headCh);
headCh = buf[start];
}
start++;
// flag = 1;
}
line++;
flag = 0;
headCh = goBlank(in);
if(isNotation==0)
{
fprintf(out, "%c%s\n", head, buf);
}
isNotation = 0;
}
puts("Everything has done...");
writeToken();
writeVarTable();
writeConTable();
fclose(in);
fclose(out);
fclose(error);
printf("错误日志\t\t\terror.txt\n");
printf("Token文件\t\t\ttoken.txt\n");
printf("无注释头文件的源文件日志\tout.txt\n");
printf("常量符号表\t\t\tconTable.txt\n");
printf("变量符号表\t\t\tvarTable.txt\n");
return 0;
}
/**
 * Pre-processing pass: consumes the leading lines that start with '#'.
 *
 * For each such line the text after '#' is read into buf and checked by
 * dealInclude(); the line counter is advanced and an empty line is written
 * to out in its place.
 *
 * Returns the first non-blank character that is not '#', i.e. the first
 * character of the code that follows the headers.
 */
char predeal(FILE *in)
{
    char ch = goBlank(in);

    while (ch == '#') {
        /* A '#' at the very end of the file leaves nothing to read; hand
           dealInclude() an empty line instead of the previous contents. */
        if (fgets(buf, BSIZE, in) == NULL) {
            buf[0] = '\0';
        }
        dealInclude(buf);
        line++;
        fputc('\n', out);
        ch = goBlank(in);
    }
    printf("headers done...\n");
    return ch;
}
/** deal headers, like #include <...>**/
void dealInclude(char *buf)
{
char include[15];
char ch;
int i=9;
strncpy(include, buf, 9);
include[9] = '\0';
//printf("%s%d",include,strlen(include));
if(strcmp(include, "include <")==0)
{
while((ch=buf[i])!='>')
{
i++;
if(ch=='\n')
{
fprintf(error, "L%d\theaders end without '>'\n",line);
break;
}
}
}
else
{
fprintf(error, "L%d\theaders format error\n",line);
}
}
/**
 * Skips spaces, tabs and newlines on the input stream.
 *
 * Every newline that is skipped increments the global line counter and is
 * copied to out.  Returns the first character that is not a blank (this is
 * whatever fgetc() produced, converted to char).
 */
char goBlank(FILE* in)
{
    char cur;

    for (;;) {
        cur = fgetc(in);
        if (cur == '\n') {
            line++;
            fputc('\n', out);
            continue;
        }
        if (cur != ' ' && cur != '\t') {
            return cur;
        }
    }
}
/**
 * Handles a token that starts with a letter: a keyword or an identifier.
 *
 * The word is headCh followed by the run of letters/digits at buf[start].
 * Keywords become a token whose symbol is the keyword code and whose attr is
 * "--".  Anything else is appended to the variable table as a new entry
 * (no lookup is done, so every occurrence gets its own entry) and becomes an
 * IDN token whose attr is the decimal table index.
 *
 * On return start indexes the first character after the word and headCh
 * holds that character.
 */
void dealAlpha()
{
    char word[100];
    int len = 0;
    int symbol;
    int i;
    Token token;
    VarTable varTable;

    word[len++] = headCh;
    for (i = start; isdigit((unsigned char)buf[i]) || isalpha((unsigned char)buf[i]); i++) {
        /* Consume the whole word but keep only what fits: the table and
           token name fields hold 100 bytes. */
        if (len < (int)sizeof word - 1) {
            word[len++] = buf[i];
        }
    }
    word[len] = '\0';

    start = i;
    headCh = buf[start];

    symbol = isKeyword(word);
    if (symbol == -1) {
        /* Not a keyword: record it as an identifier. */
        varTable.id = varTableNum;
        strcpy(varTable.name, word);
        varTableArray[varTableNum] = varTable;
        varTableNum++;
        token.symbol = IDN;
        sprintf(token.attr, "%d", varTable.id);
    } else {
        token.symbol = symbol;
        strcpy(token.attr, "--");
    }
    strcpy(token.name, word);
    tokenArray[tokenNum] = token;
    tokenNum++;
}
/**
 * Looks word up in keywordList (which ends with an empty string).
 *
 * Returns 256 + the index of the matching entry, or -1 when word is not a
 * keyword.
 */
int isKeyword(char * word)
{
    int idx = 0;

    while (keywordList[idx][0] != '\0') {
        if (!strcmp(word, keywordList[idx])) {
            return 256 + idx;
        }
        idx++;
    }
    return -1;
}
/**
 * Writes every collected token to token.txt, one per line, as
 * `name<TAB><TAB>(symbol, attr)`.
 *
 * If the file cannot be created a message is printed and nothing is written.
 */
void writeToken()
{
    FILE *ftoken = fopen("token.txt", "w+");
    int i;

    if (ftoken == NULL) {
        printf("cannot create file token.txt!\n");
        return; /* writing to a NULL stream would crash */
    }
    for (i = 0; i < tokenNum; i++) {
        fprintf(ftoken, "%s\t\t(%d, %s)\n", tokenArray[i].name, tokenArray[i].symbol, tokenArray[i].attr);
    }
    fclose(ftoken);
}
/**
 * Writes the variable symbol table to varTable.txt, one entry per line, as
 * `name<TAB><TAB>id`.
 *
 * If the file cannot be created a message is printed and nothing is written.
 */
void writeVarTable()
{
    FILE *fvarTable = fopen("varTable.txt", "w+");
    int i;

    if (fvarTable == NULL) {
        printf("cannot create file varTable.txt!\n");
        return; /* writing to a NULL stream would crash */
    }
    for (i = 0; i < varTableNum; i++) {
        fprintf(fvarTable, "%s\t\t%d\n", varTableArray[i].name, varTableArray[i].id);
    }
    fclose(fvarTable);
}
/**
 * Writes the constant symbol table to conTable.txt, one entry per line, as
 * `name<TAB><TAB>id`.
 *
 * If the file cannot be created a message is printed and nothing is written.
 */
void writeConTable()
{
    FILE *fconTable = fopen("conTable.txt", "w+");
    int i;

    if (fconTable == NULL) {
        printf("cannot create file conTable.txt!\n");
        return; /* writing to a NULL stream would crash */
    }
    for (i = 0; i < conTableNum; i++) {
        fprintf(fconTable, "%s\t\t%d\n", conTableArray[i].name, conTableArray[i].id);
    }
    fclose(fconTable);
}
/**deal with digit**/
void dealDigit()
{
int symbol;
int id;
char word[100];
Token token;
ConTable conTable;
int i;
word[0] = headCh;
for(i=start; isdigit(buf[i]); i++)
{
word[i-start+1] = buf[i];
}
if(buf[i]=='.')
{
if(!isdigit(buf[++i]))
{
start = i;
headCh = buf[start];
fprintf(error, "L%d\tunavailabe float\n",line);
return;
}
word[i-start] = '.';
for(; isdigit(buf[i]); i++)
{
word[i-start+1] = buf[i];
}
word[i-start+1] = '\0';
// forward = i;
start = i;
headCh = buf[start];
id = isInConTable(word);
/**不在常量表里,新加项**/
if(id==-1)
{
conTable.id = conTableNum;
strcpy(conTable.name, word);
conTableArray[conTableNum] = conTable;
conTableNum++;
id = conTable.id;
}
token.symbol = FNUM;
sprintf(token.attr, "%d", id);//change int to string
strcpy(token.name,word);
tokenArray[tokenNum] = token;
tokenNum++;
}
else
{
word[i-start+1] = '\0';
// forward = i;
start = i;
headCh = buf[start];
id = isInConTable(word);
/**不在常量表里,新加项**/
if(id==-1)
{
conTable.id = conTableNum;
strcpy(conTable.name, word);
conTableArray[conTableNum] = conTable;
conTableNum++;
id = conTable.id;
}
token.symbol = INUM;
sprintf(token.attr, "%d", id);//change int to string
strcpy(token.name,word);
tokenArray[tokenNum] = token;
tokenNum++;
}
}
/**
 * Handles an operator / delimiter token.
 *
 * First tries the two-character string headCh + buf[start] (skipped when
 * buf[start] is the end of the line), then headCh alone.  The first
 * borderList entry that matches yields a token whose symbol is 400 + the
 * entry's index and whose attr is "--".
 *
 * After a two-character match start is advanced past the second character;
 * in both cases headCh is reloaded from buf[start].  If nothing matches the
 * function returns without changing anything.
 */
void dealBorder()
{
    Token token;
    char s[3];
    int idx;
    int width;

    s[0] = headCh;
    s[1] = buf[start];
    s[2] = '\0';

    /* width 2 = two-character operator, width 1 = single character. */
    for (width = (s[1] != '\0') ? 2 : 1; width >= 1; width--) {
        s[width] = '\0';
        for (idx = 0; borderList[idx][0]; idx++) {
            if (strcmp(s, borderList[idx]) != 0) {
                continue;
            }
            strcpy(token.name, s);
            token.symbol = idx + 400;
            strcpy(token.attr, "--");
            tokenArray[tokenNum++] = token;
            if (width == 2) {
                start++;
            }
            headCh = buf[start];
            return;
        }
    }
}
/**
 * Tells whether ch can start an operator / delimiter.
 *
 * Returns 1 when ch equals the first character of any borderList entry,
 * 0 otherwise.
 */
int isBorder(char ch)
{
    char **entry = borderList;

    while ((*entry)[0] != '\0') {
        if ((*entry)[0] == ch) {
            return 1;
        }
        entry++;
    }
    return 0;
}
/**处理注释**/
int dealNotation()
{
char ch = buf[start];
int i;
int notationLen=0;
Token token;
//printf("dealNotation: ch = %c\n", ch);
/**除号处理**/
if(ch!='/'&&ch!='*')
{
strcpy(token.name,"/");
strcpy(token.attr,"--");
token.symbol = DIV;
tokenArray[tokenNum++] = token;
start++;
headCh = buf[start];
return 0;
}
if(ch=='/')
{
fputc('\n',out);
isNotation = 1;
headCh='\0';
return 0;
}
else if(ch=='*')
{
for(i=start+1;; i++)
{
if(buf[i]=='\0')
{
fgets(buf, BSIZE, in);
line++;
fputc('\n', out);
start = 0;
headCh = '\0';
i = -1;
}
else
{
while(!(buf[i]=='*'&&buf[i+1]=='/'))
{
i++;
if(++notationLen == LIMIT_NOTATION)
{
fprintf(error, "L%d\tnotation too long\n", line);
isNotation = 1;
return -1;
}
// printf("notationLen = %d\n", notationLen);
if(buf[i+1]=='\0')
{
fgets(buf, BSIZE, in);
line++;
fputc('\n', out);
i = 0;
}
}
fputc('\n', out);
isNotation = 1;
start = i + 2;
headCh = buf[start];
return 0;
}
}
}
}
/**
 * Handles a character constant (ch == '\'') or a string constant
 * (ch == '"').  On entry buf[start] is the character after the opening
 * quote.
 *
 * Character constants:
 *  - a backslash followed by one of the 12 characters in changeList and a
 *    closing quote yields a token with symbol 500 + the changeList index and
 *    attr "--";
 *  - exactly one ordinary character followed by a closing quote yields a
 *    CCHAR token whose attr is the constant-table index;
 *  - anything else is logged as an error and input is skipped up to and
 *    including the next '\'' (or to the end of the line).
 *
 * String constants run to the next '"' (escaped quotes are not recognised)
 * and yield a CSTR token whose attr is the constant-table index.  A string
 * with no closing quote on the line is logged and produces no token.
 *
 * On return start indexes the first unconsumed character and headCh holds
 * that character.
 */
void dealChar(char ch)
{
    Token token;
    ConTable conTable;
    int i = start;
    int id;
    int j;
    int len = 0;
    char word[100];

    word[len++] = ch;

    if (ch == '\'') {
        if (buf[i] == '\\') {
            /* Escape sequence. */
            for (j = 0; j < 12; j++) {
                if (buf[i + 1] == changeList[j]) {
                    break;
                }
            }
            if (j == 12) {
                fprintf(error, "L%d\tunavailable change char\n", line);
                i = i + 1;
            } else if (buf[i + 2] != '\'') {
                /* Matched escape, but the constant is not closed here. */
                fprintf(error, "L%d\tthe length of const char is unavailabe\n", line);
                i = i + 2;
            } else {
                word[1] = '\\';
                word[2] = buf[i + 1];
                word[3] = '\'';
                word[4] = '\0';
                strcpy(token.name, word);
                strcpy(token.attr, "--");
                token.symbol = j + 500;
                tokenArray[tokenNum++] = token;
                start = i + 3;
                headCh = buf[start];
                return;
            }
        } else if (buf[i] == '\0' || buf[i + 1] != '\'') {
            /* Empty, too long, or cut off by the end of the line. */
            fprintf(error, "L%d\tthe length of const char is unavailabe\n", line);
        } else {
            word[1] = buf[i];
            word[2] = '\'';
            word[3] = '\0';
            id = isInConTable(word);
            if (id == -1) {
                /* Not in the constant table yet: append a new entry. */
                conTable.id = conTableNum;
                strcpy(conTable.name, word);
                conTableArray[conTableNum++] = conTable;
                id = conTable.id;
            }
            token.symbol = CCHAR;
            sprintf(token.attr, "%d", id);
            strcpy(token.name, word);
            tokenArray[tokenNum++] = token;
            start = i + 2;
            headCh = buf[start];
            return;
        }

        /* Error recovery: skip to just past the closing quote, but never
           beyond the end of the line. */
        while (buf[i] != '\0' && buf[i] != '\'') {
            i++;
        }
        start = (buf[i] == '\'') ? i + 1 : i;
        headCh = buf[start];
        return;
    }

    if (ch == '"') {
        for (; buf[i] != '"' && buf[i] != '\0'; i++) {
            /* Leave room for the closing quote and the terminator. */
            if (len < (int)sizeof word - 2) {
                word[len++] = buf[i];
            }
        }
        if (buf[i] == '\0') {
            fprintf(error, "L%d\tstring constant is not closed\n", line);
            start = i;
            headCh = buf[start];
            return;
        }
        word[len++] = '"';
        word[len] = '\0';
        id = isInConTable(word);
        if (id == -1) {
            /* Not in the constant table yet: append a new entry. */
            conTable.id = conTableNum;
            strcpy(conTable.name, word);
            conTableArray[conTableNum++] = conTable;
            id = conTable.id;
        }
        token.symbol = CSTR;
        sprintf(token.attr, "%d", id);
        strcpy(token.name, word);
        tokenArray[tokenNum++] = token;
        start = i + 1;
        headCh = buf[start];
    }
}
/**
 * Searches the variable symbol table for name.
 *
 * Returns the id stored in the first matching entry, or -1 when no entry
 * has that name.
 */
int isInVarTable(char *name)
{
    int idx = 0;

    while (idx < varTableNum) {
        if (!strcmp(name, varTableArray[idx].name)) {
            return varTableArray[idx].id;
        }
        idx++;
    }
    return -1;
}
/**
 * Searches the constant symbol table for name.
 *
 * Returns the id stored in the first matching entry, or -1 when no entry
 * has that name.
 */
int isInConTable(char *name)
{
    int idx = 0;

    while (idx < conTableNum) {
        if (!strcmp(name, conTableArray[idx].name)) {
            return conTableArray[idx].id;
        }
        idx++;
    }
    return -1;
}
global.h
#ifndef GLOBAL_H_INCLUDED
#define GLOBAL_H_INCLUDED
#include <stdio.h>
#include <string.h>
/* Size of the line buffer `buf`; also the limit passed to fgets(). */
#define BSIZE 1024
/* Size of the buffer that receives the input file name in main(). */
#define FSIZE 50
/* Capacity of tokenArray. */
#define TSIZE 1024
/* Capacity of varTableArray. */
#define VTSIZE 1024
/* Capacity of conTableArray. */
#define CTSIZE 1024
/* dealNotation() reports "notation too long" once it has scanned this many
   characters of a block comment without finding the terminator. */
#define LIMIT_NOTATION 10
/** Keyword token codes.
 *  Code N corresponds to keywordList[N - 256]; isKeyword() returns
 *  256 + the keywordList index of the matching entry. **/
#define INCLUDE 256
#define AUTO 257
#define BREAK 258
#define CASE 259
#define CHAR 260
#define CONST 261
#define CONTINUE 262
#define DEFAULT 263
#define DO 264
#define DOUBLE 265
#define ELSE 266
#define ENUM 267
#define EXTERN 268
#define FLOAT 269
#define FOR 270
#define GOTO 271
#define IF 272
#define INT 273
#define LONG 274
#define REGISTER 275
#define RETURN 276
#define SHORT 277
#define SIGNED 278
#define SIZEOF 279
#define STATIC 280
#define STRUCT 281
#define SWITCH 282
#define TYPEDEF 283
#define UNION 284
#define UNSIGNED 285
#define VOLATILE 286
#define WHILE 287
/** Token codes for identifiers and constants.
 *  IDN is emitted by dealAlpha(), INUM/FNUM by dealDigit(),
 *  CCHAR/CSTR by dealChar(). **/
#define IDN 300
#define INUM 301
#define FNUM 302
#define CCHAR 303
#define CSTR 304
/** Operator / delimiter token codes.
 *  dealBorder() emits 400 + the index of the matching borderList entry.
 *  NOTE(review): this list has 41 names (400-440) while borderList has 40
 *  operator strings and nothing at index 18 that corresponds to SIG.  Codes
 *  400-417 line up with borderList; from 418 on the emitted code is one
 *  lower than the name here suggests (e.g. "!" is emitted as 418 and "++"
 *  as 421).  Of these names only DIV is referenced in main.c. **/
#define PLUS 400
#define MINUS 401
#define MUL 402
#define DIV 403
#define REM 404
#define SEMI 405
#define COM 406
#define BLP 407
#define BRP 408
#define SRP 409
#define SLP 410
#define BIG 411
#define SML 412
#define EQU 413
#define MLP 414
#define MRP 415
#define COL 416
#define QUE 417
#define SIG 418
#define NOT 419
#define AND 420
#define OR 421
#define PP 422
#define MM 423
#define EQEQ 424
#define NOTL 425
#define NOTR 426
#define DECL 427
#define BIGE 428
#define SMLE 429
#define NOTE 430
#define AA 431
#define OO 432
#define ANDE 433
#define MINUE 434
#define MULE 435
#define DIVE 436
#define XOR 437
#define RIGHT 438
#define LEFT 439
#define TURN 440
/** Escape-sequence token codes.
 *  Code N corresponds to changeList[N - 500]; dealChar() emits
 *  500 + the changeList index of the character after the backslash. **/
#define CA 500
#define CB 501
#define CF 502
#define CN 503
#define CR 504
#define CT 505
#define CV 506
#define CBSL 507
#define CQUE 508
#define CDQM 509
#define CQM 510
#define ZERO 511
/** Data structures **/
/* One entry of the variable (identifier) symbol table. */
typedef struct varTable
{
/* Index of the entry in varTableArray at the time it was added. */
int id;
/* Identifier text, NUL-terminated. */
char name[100];
} VarTable;
/* One entry of the constant symbol table (numbers, character and string
   constants). */
typedef struct conTable
{
/* Index of the entry in conTableArray at the time it was added. */
int id;
/* Source text of the constant, NUL-terminated (quotes included for
   character and string constants). */
char name[100];
} ConTable;
/* One lexical token as written to token.txt. */
typedef struct token
{
/* Source text of the token, NUL-terminated. */
char name[100];
/* Token code (one of the numeric codes defined above). */
int symbol;
/* Symbol-table index printed as a decimal string, or "--" for tokens that
   have no table entry (keywords, operators, escape characters). */
char attr[100];
} Token;
/** Global state shared by all lexer functions **/
FILE *error;              /* error log (error.txt) */
FILE *out;                /* echo of the source without headers/comments (out.txt) */
FILE *in;                 /* source file being analysed */
int line=1;               /* current source line number */
int tokenNum=0;           /* number of entries used in tokenArray */
int varTableNum=0;        /* number of entries used in varTableArray */
int conTableNum=0;        /* number of entries used in conTableArray */
int start = 0;            /* index in buf of the character that follows headCh */
int forward = 0;
int flag =0;
int isNotation=0;         /* set while the current line contained a comment; such lines are not echoed to out */
char buf[BSIZE];          /* current line, minus the characters already consumed into headCh */
Token tokenArray[TSIZE];
VarTable varTableArray[VTSIZE];
ConTable conTableArray[CTSIZE];
char headCh;              /* character currently being examined */
char head;                /* first character of the current line, kept for echoing */
/* Operators and delimiters.  dealBorder() emits 400 + index, so the order
   must not change.  The trailing "" is the end marker that dealBorder() and
   isBorder() loop up to; without it they would read past the array. */
char *borderList[] = { "+","-","*","/","%",";",",","{","}",")","(",">","<","=","[","]",":","?","!","&","|",
"++","--","==","/*","*/",":=",">=","<=","!=","&&","||","+=","-=","*=","/=","^",">>","<<","~",
""
};
/* Keywords.  isKeyword() returns 256 + index, so the order must not change.
   The trailing "" is the end marker. */
char *keywordList[] = {"include","auto","break","case","char","const","continue","default","do","double",
"else","enum","extern","float","for","goto","if","int","long","register",
"return","short","signed","sizeof","static","struct","switch","typedef",
"union","unsigned","volatile","while",""
};
/* Characters accepted after a backslash in a character constant.
   dealChar() emits 500 + index, so the order must not change. */
char changeList[12] = {'a', 'b', 'f','n','r','t','v','\\','?','"','\'','0'};
/** Function prototypes.
 *  Functions without parameters are declared with (void) so that a call
 *  with arguments is rejected by the compiler. **/
/* Skips blanks and newlines on `in`; returns the first other character. */
char goBlank(FILE* in);
/* Consumes the leading '#' lines; returns the first character after them. */
char predeal(FILE* in);
/* Checks one header line (the text after '#'). */
void dealInclude(char* buf);
/* Handles a keyword or identifier. */
void dealAlpha(void);
/* Handles a numeric literal. */
void dealDigit(void);
/* Handles an operator or delimiter. */
void dealBorder(void);
/* Handles '/', line comments and block comments; returns 0, or -1 on error. */
int dealNotation(void);
/* Handles a character constant (ch == '\'') or string constant (ch == '"'). */
void dealChar(char ch);
/* Returns the keyword code of word, or -1 when it is not a keyword. */
int isKeyword(char *word);
/* Write the token list and the two symbol tables to their files. */
void writeToken(void);
void writeVarTable(void);
void writeConTable(void);
/* Returns 1 when ch can start an operator or delimiter, else 0. */
int isBorder(char ch);
/* Return the id of the table entry called name, or -1 when absent. */
int isInVarTable(char *name);
int isInConTable(char *name);
#endif // GLOBAL_H_INCLUDED
程序说明:
1、不考虑自定义头文件,#include < 规定这样开头,尖括号前面只能空一格。
2、标识符长度< 100,否则越界。
3、字符常量的长度不可超过1(转义字符除外);字符常量和字符串常量都必须有结尾的引号。
4、注释会整行打印空行。
5、错误文件输出到error.txt;常量符号表:conTable.txt;变量符号表:varTable.txt;无头文件和注释的程序out.txt;token输出到token.txt
6、错误处理包括:不合法的浮点数,注释不封闭,转义字符不存在,头文件包含错误,程序中有非法字符。
7、为了显示注释不封闭的错误处理,定义超过10个字符的“/*”没有结尾就算不封闭。可以在global.h中进行更改,缓冲区大小,文件名长度,token, conTable, varTable的数组长度都可定义。由于struct中不是指针,比较占用空间,故不可分析太大的程序,容易数组越界。