1、词法分析功能
输入:所给文法的源程序字符串。
输出:二元组(syn,token或sum)构成的序列。其中,
syn为单词种别码。
Token为存放的单词自身字符串。
Sum为整型常量。
具体实现时,可以将单词的二元组用结构进行处理。
2、待分析的C语言子集的词法
1)关键字
main if then while do static int double struct break else long switch case typedef char return const float short continue for void default sizeof
所有的关键字都是小写。
2)运算符和界符
+ - * / : < <> <= > >= = ; ( ) #
3)其他标记ID和NUM
通过以下正规式定义其他标记:
ID→letter(letter|digit)*
NUM→digit digit*
letter→a|…|z|A|…|Z
digit→0|…|9…
4)空格由空白、制表符和换行符组成
空格一般用来分隔ID、NUM、专用符号和关键字,词法分析阶段通常被忽略。
3、各种单词符号对应的种别码
表1 各种单词符号的种别码
单词符号 种别码 单词符号 种别码 单词符号 种别码 单词符号 种别码 单词符号 种别码
# 0 else 11 for 22 == 33 ) 44
main 1 long 12 void 23 < 34 { 45
if 2 switch 13 default 24 <> 35 } 46
then 3 case 14 sizeof 25 <= 36 , 47
while 4 typedef 15 ID 26 > 37
do 5 char 16 NUM 27 >= 38
static 6 return 17 + 28 = 39
int 7 const 18 - 29 [ 40
double 8 float 19 * 30 ] 41
struct 9 short 20 / 31 ; 42
break 10 continue 21 ** 32 ( 43
上述为一些基本符号,可以自己另加。
状态转换图:
代码:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <string>
using namespace std;
FILE *fp; // 文件指针,处理源文件
char *ScanBuffer; // 扫描缓冲区,一分为二,每次一半存放预处理后的字符串
int capacity; // 扫描缓冲区容量
int BufferFlag; // 状态标志,0 表示使用缓冲区的左半区,1 表示使用右半区
int PreprocessFlag; // 预处理的状态标志
string rwtab[25]={"main","if","then","while","do"," static",
"int"," double","struct","break","else","long","switch",
"case","typedef","char","return","const","float","short",
"continue","for","void","default","sizeof"};
// init fp, ScanBuffer, BufferFlag,SP,EP,PreprocessFlag
void init(){
fp = fopen("test.txt","r");
if(fp == NULL) exit(0);
capacity = 120;
ScanBuffer = new char[capacity]; // 缓冲区分配长度 :120。
memset(ScanBuffer,0,sizeof(ScanBuffer));
BufferFlag = 0;
PreprocessFlag = 0;
}
void close(){
fclose(fp);
delete ScanBuffer;
}
void Preprocess(){
//参见词法分析一
}
/*
分析器变量
SP: 分析器的起点指示器
EP: 分析器的搜索指示器
syn: 种别编码
token: 存放一个处理的字符串
sum: 常量
status:状态
*/
int SP = 0; int EP = 0;
int syn = -1; string token = "";
int sum = 0; int status = 0;
// 字母
int IsLetter(char c){
if(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') return 1;
return 0;
}
//数字
int IsDigit(char c){
if(c >= '0' && c <= '9') return 1;
return 0;
}
// 关键字或者是变量
void Key(){
token = "";
if(SP < EP){
for(int i = SP; i < EP; i++) token.push_back(ScanBuffer[i]);
}else{
for(int i = SP; i < capacity; i++) token.push_back(ScanBuffer[i]);
for(int i = 0; i < EP; i++) token.push_back(ScanBuffer[i]);
}
for(int i = 0; i < 25; i++) if(token.compare(rwtab[i]) == 0) {syn = i+1; return;}
syn = 26; return;
}
void Number(){
sum = 0;
if(SP < EP){
for(int i = SP; i < EP; i++) sum = sum * 10 + ScanBuffer[i] - '0';
}else{
for(int i = SP; i < capacity; i++) sum = sum * 10 + ScanBuffer[i] - '0';
for(int i = 0; i < EP; i++) sum = sum * 10 + ScanBuffer[i] - '0';
}
syn = 27;
}
// 不合法变量的处理,例如以数字开头的变量
void InvalidToken(){
token = "";
if(SP < EP){
for(int i = SP; i < EP; i++) token.push_back(ScanBuffer[i]);
}else{
for(int i = SP; i < capacity; i++) token.push_back(ScanBuffer[i]);
for(int i = 0; i < EP; i++) token.push_back(ScanBuffer[i]);
}
while(IsLetter(ScanBuffer[EP]) || IsDigit(ScanBuffer[EP])) {token.push_back(ScanBuffer[EP]); EP++;}
}
// 单个字符 ,例如‘<’
void OneCharacter(){
switch(status){
case 11: syn = 39; break;
case 16: syn = 30; break;
case 22: syn = 34; break;
case 24: syn = 37; break;
}
}
// 双字符 ,例如‘<=’
void TwoCharacter(){
switch(status){
case 12: syn = 33; break;
case 17: syn = 32; break;
case 20: syn = 35; break;
case 21: syn = 36; break;
case 25: syn = 38; break;
}
}
void CharacterMatch(char ch){
switch(ch){
case ' ': break;
case ';': {status = 5; syn = 42; break;}
case '(': {status = 6; syn = 43; break;}
case ')': {status = 7; syn = 44; break;}
case '[': {status = 8; syn = 40; break;}
case ']': {status = 9; syn = 41; break;}
case '{': {status = 27;syn = 45; break;}
case '}': {status = 28;syn = 46; break;}
case ',': {status = 29;syn = 47; break;}
case '+': {status = 13;syn = 28; break;}
case '-': {status = 14;syn = 29; break;}
case '/': {status = 18;syn = 31; break;}
case '#': {status = 26;syn = 0; break;}
case '=': {status = 10; break;}
case '*': {status = 15; break;}
case '<': {status = 19; break;}
case '>': {status = 23; break;}
default: {status = -1; break;}
}
}
void StatusHandle(){
switch(status){
case -2: {
InvalidToken(); cout<<"invalid token: "<<token<<endl; status = 0; SP = EP;break;
}
case -1: {
printf("invalid character: %c\n",ScanBuffer[EP]); EP++; status = 0; SP = EP;break;
}
case 1: case 3: case 10: case 15: case 19: case 23:{
EP++; break;
}
case 0:{
EP++; SP = EP; break;
}
case 2:{
Key(); cout<<"("<<syn<<","<<token<<")"<<endl; SP = EP; status = 0; break;
}
case 4:{
Number();cout<<"("<<syn<<","<<sum<<")"<<endl; SP = EP; status = 0; break;
}
case 5:case 6:case 7:case 8:case 9:case 13:case 14:case 18:case 26:case 27:case 28:case 29:{
cout<<"("<<syn<<","<<ScanBuffer[SP]<<")"<<endl; EP++; SP = EP; status = 0; break;
}
case 11: case 16: case 22: case 24:{
OneCharacter(); cout<<"("<<syn<<","<<ScanBuffer[SP]<<")"<<endl; SP++; EP = SP; status = 0; break;
}
case 12: case 17: case 20: case 21: case 25:{
TwoCharacter();
cout<<"("<<syn<<","<<ScanBuffer[SP]<<ScanBuffer[EP]<<")"<<endl;
EP++; SP = EP;
status = 0; break;
}
}
}
// 状态转换图
void Analyse(char ch){
switch(status){
case 0:{
if(IsLetter(ch)) status = 1;
else if(IsDigit(ch)) status = 3;
else CharacterMatch(ch);
break;
}
case 1:{
if(IsLetter(ch) || IsDigit(ch)) break;
else status = 2; break;
}
case 3:{
if(IsDigit(ch)) break;
else if(IsLetter(ch)){
status = -2; break;
}else {
status = 4; break;
}
}
case 10:{
if(ch == '=') {status = 12; break;}
else status = 11; break;
}
case 15:{
if(ch == '*') {status = 17; break;}
else status = 16; break;
}
case 19:{
if(ch == '>') {status = 20; break;}
else if(ch == '='){status = 21; break;}
else status = 23;break;
}
case 23:{
if(ch == '=') {status = 25; break;}
else status = 24; break;
}
}
StatusHandle();
}
void Analyzer(){
while(!feof(fp)){
Preprocess(); BufferFlag = BufferFlag ^ 1;
if(BufferFlag){
EP = 0; // 使用左半缓冲区
if(SP == capacity) SP = 0; // 防止SP越界
while(ScanBuffer[EP] != -1 && EP < capacity/2){
Analyse(ScanBuffer[EP]);
}
}else{
while(ScanBuffer[EP] != -1 && EP < capacity){
Analyse(ScanBuffer[EP]);
}
}
}
}
int main(){
init();
Analyzer();
close();
return 0;
}
测试用例:
// 单行注释
/*
多行注释
*/
int ArraySum(int **a, int m, int n){
int 9sum = 0;
for(int i = 0; i < m; i = i + 1){
for(int j = 0; j < n; j = j + 1){
sum = sum + a[i][j];
}
}
return sum;
}
测试结果: