C++-c语言词法分析器

该文描述了一个C语言的词法分析器,能够识别关键字、运算符、常量、标识符等,并处理预编译指令、注释和各种类型的错误。词法分析器还实现了高亮输出和错误记录功能,对源代码进行预处理以解决字符串和注释中的问题。
摘要由CSDN通过智能技术生成

一、运行截图

  1. 对于 Test.c 的词法分析结果
  • 在这里插入图片描述
  1. 对于词法分析器本身的源代码的分析结果
  • 在这里插入图片描述

二、主要功能

经过不断的修正和测试代码,分析测试结果,该词法分析器主要实现了以下功能:

1. 识别关键字

  • 实验要求:if else while do for main return int float double char。
  • 数据类型关键字 void unsigned long enum static short signed struct union;
  • 控制语句关键字 continue break switch case default goto;
  • 存储类型关键字 auto static extern register;
  • 其他关键字 const volatile sizeof typedef;
  • 预编译指令 #xxx。

2. 识别运算符

  • 实验要求:= + - * / % < <= > >= != ==
  • 算术运算符 ++ –
  • 逻辑运算符 && || !
  • 位运算符 & | ^ ~ << >>
  • 赋值运算符 += -= *= /= %= <<= >>= &= ^= |=
  • 杂项运算符 , ? : -> .

3. 识别界限符

  • 实验要求:; ( ) { }
  • 其他:[ ]

4. 识别常量

  • 实验要求:无符号十进制整型常量,正规式为:(1-9)(0-9)*
  • 无符号二进制整型常量,正规式为: 0(B|b)(0-1)*
  • 无符号八进制整型常量,正规式为: 0(0-7)*
  • 无符号十六进制整型常量,正规式为: 0(X|x)(0-9|a-f|A-F)*
  • 字符串常量,正规式为: “(all char)*”
  • 字符常量,正规式为: ‘(all char)*’

5. 识别标识符

  • 实验要求:以字母开头,正规式为:letter(letter|digit)*
  • 以下划线开头,正规式为:_ (letter|digit|$ | _) *
  • 以美元符号开头,正规式为:$ (letter|digit|$ | _ ) *

6. 识别其他符号

  • 实验要求:空格符(’ ‘),制表符(’\t’),换行符(‘\n’)
  • 单行注释(“//”)
  • 多行注释(“/*”)
  • 转义字符(‘\’)

特别的,在预处理阶段,识别单/多行注释时,遇到了一个小问题,由于要支持字符串常量和字符常量,而该常量中可能包含//,/*,导致将字符串内容识别为注释,另外,由于需要支持转义字符‘\’的存在,正确识别字符串的结束符号,也成为了一大问题,在多次修改匹配代码后,终于将问题解决。

7. 词法高亮

利用词法分析的token结果,按不同的类型码,对源代码文件进行着色输出。

运算符和界限符:黄色。
关键字:淡蓝色。
无符号二,八,十,十六进制整型常量:淡红色。
标识符:淡绿色。
字符串常量:淡黄色。

8. 词法错误处理

当朴素匹配或者正规匹配失败,或者匹配结束后,回退字符之前,出现非法字符时,对相应的错误进行记录并在分析完毕后进行输出。

目前已处理错误:未找到匹配的界限符()]}),多行注释未找到结束符(*/),出现未定义的字符(@¥`)。

值得注意的一点,当遇到未处理的词法错误时,程序存在可能的崩溃隐患。

三、项目内容

1. 测试代码 Test.c

/**
 * used to test keywords
 * //ohhhhhhhhhhhhhhh
 */
void $Func$ForKey_2(int a, unsigned b)
{
start:	if (1){}else{}
	while (1) { continue; }
	do{}while (1);
	for (;;){          }}
	switch (1){case 1: break;default: break;}
	enum { run };
	goto start;
}

struct FStruct{int val = 0000x1234;};
union FUnion{int val = 0B1 + 0b1 + 01 + 0X1 + 0x1;};
extern i;

// used to test operator
void __FuncForOperation5(const float c, long d, signed e, short f)
{
	typedef int Moota;
	register int i = 1234 + sizeof(Moota) + sizeof("/**/hello world//\\") + sizeof('\a');
	volatile int j = i + 1 - 1 * 1 / 1 % 1 <= 1 < 1 > 1 >= 1 != 1 == 1;
	auto int m = (i++) + (++i) + (i && i || i & i & i | i ^ i << 1) + i >> 1 + ~i;
	m += 1;m -= 2;m *= 3;m / 4;m %= 5;m <<= 6;m >>= 7;m &= 8;m |= 9;m ^= 10;
	m = m ? 1 : 0;
	struct FStruct* Ohhh;Ohhh->val;
	struct FStruct Emmm;Emmm.val;
}

/*
int main(void)
{
	$Func$ForKey_2(1, 1);__FuncForOperation5(1, 1, 1, 1);
	return 0;
}

2. 项目代码 LexicalAnalyzer.cpp

/** Copyright Moota, Private. All Rights Reserved. */

#include <iostream>
#include <map>
#include <stack>
#include <vector>
#include <string>
#include <fstream>
#include "Windows.h"

/**
 * 词法分析器
 * 可以识别的单词种类包括以下部分(括号代表实验要求之外的单词)
 * 
 * (1) 关键字
 * if else while do for main return int float double char
 * (数据类型关键字 void unsigned long enum static short signed struct union)
 * (控制语句关键字 continue break switch case default goto)
 * (存储类型关键字 auto static extern register)
 * (其他关键字 const volatile sizeof typedef)
 * (预编译指令 #xxx)
 *  
 * (2) 运算符
 * = + - * / % < <= > >= != ==
 * (算术运算符 ++ --)
 * (关系运算符)
 * (逻辑运算符 && || !)
 * (位运算符 & | ^ ~ << >>)
 * (赋值运算符 += -= *= /= %= <<= >>= &= ^= |=)
 * (杂项运算符 , ? : -> .)
 * 
 * (3) 界限符
 * ; ( ) { }
 * ([ ])
 * 
 * (4) 常量
 * 正规式为 digit1 digit2*,只考虑无符号十进制整型常量,digit1包括1-9,digit2包括0-9)
 * (无符号二进制整型常量 0(B|b)(0-1)*)
 * (无符号八进制整型常量 0(0-7)*)
 * (无符号十六进制整型常量 0(X|x)(0-9|a-f|A-F)*)
 * (字符串常量 "(all char)*" )
 * (字符常量 '(all char)*' )
 * 
 * (5) 标识符
 * 正规式为 letter(letter|digit)*,区分大小写
 * (_(letter|digit|$)*)
 * ($(letter|digit|$)*)
 * 
 * (6) 其他符号
 * 空格符(' '),制表符('\t'),换行符('\n'),单行多行注释("//","\/\*") 应该在词法分析阶段被忽略
 */
class FLexicalAnalyzer
{
public:
	FLexicalAnalyzer()
	{
		SetConsoleTitle(L"Lexical Analyzer V1.2.6");
		ShowWindow(GetForegroundWindow(), SW_MAXIMIZE);
		SetConsoleOutputCP(65001);
	}

private:
	// 工具函数

	// 字符是否小写
	static inline bool IsLower(char Data)
	{
		return 'a' <= Data and Data <= 'z';
	}

	// 字符是否大写
	static inline bool IsUpper(char Data)
	{
		return 'A' <= Data and Data <= 'Z';
	}

	// 字符是否是字母字符
	static inline bool IsLetter(char Data)
	{
		return IsLower(Data) || IsUpper(Data);
	}

	// 字符是否是数字字符
	static inline bool IsDigit(char Data)
	{
		return '0' <= Data and Data <= '9';
	}

	// 读取文件
	std::string ReadFile(const std::string& FilePath)
	{
		std::string Result;
		std::ifstream InputFile(FilePath, std::ios::in);
		if (!InputFile.is_open())
		{
			std::cerr << "The file failed to open, perhaps you need to change the file path." << "\n";
			return Result;
		}
		char Ch;
		while (InputFile.peek() != EOF)
		{
			InputFile.get(Ch);
			Result += Ch;
		}
		InputFile.close();
		TokenizeData.FilePath = FilePath;
		return Result;
	}

	// 保存文件
	static void SaveFile(const std::string& Data, const std::string& FilePath)
	{
		std::ofstream OutputFile(FilePath, std::ios::out);
		OutputFile << Data;
		OutputFile.close();
	}

private:
	std::map<std::string, int> KeywordMap;	// 关键字符号表
	std::map<std::string, int> RegularMap;	// 正规(常量,标识符)符号表
	std::map<std::string, int> SpecialMap;	// 特殊(运算符,界限符)符号表

	// 单词序列数据
	struct FTokenData
	{
		std::string Token;		// 单词符号
		int Code = 0;			// 类别码
	};

	// 词法分析数据
	struct FTokenizeData
	{
		std::string FilePath;				// 文件地址
		std::string Result;					// 最终结果
		std::string Data;					// 待分析数据
		size_t Size = 0;					// 数据长度
		size_t Index = 0;					// 当前字符索引
		char Start = ' ';					// 当前字符
		FTokenData TokenData;				// 当前符号信息
		std::stack<int> Delimiter[3];		// 界限符栈
		size_t Lines = 0;					// 词法分析当前行
		std::vector<std::string> Errors;	// 词法分析错误池
		std::vector<std::string> Warnings;	// 词法分析警告池
	} TokenizeData;

	/**
	 * 初始化各个符号表
	 */
	void Initialize()
	{
		KeywordMap["main"] = 1;
		KeywordMap["if"] = 2;
		KeywordMap["else"] = 3;
		KeywordMap["while"] = 4;
		KeywordMap["do"] = 5;
		KeywordMap["for"] = 6;
		KeywordMap["return"] = 7;

		RegularMap["letter(letter|digit)*"] = 8;
		RegularMap["digit digit*"] = 9;
		RegularMap["\"(all char)*\""] = 200;
		RegularMap["\'(all char)*\'"] = 201;
		RegularMap["_(letter|digit|$)*"] = 202;
		RegularMap["$(letter|digit|$)*"] = 203;
		RegularMap["0(B|b)(0-1)*"] = 204;
		RegularMap["0(0-7)*"] = 205;
		RegularMap["0(X|x)(0-9 a-f)*"] = 206;

		SpecialMap["+"] = 10;
		SpecialMap["-"] = 11;
		SpecialMap["*"] = 12;
		SpecialMap["/"] = 13;
		SpecialMap["%"] = 14;
		SpecialMap[">"] = 15;
		SpecialMap[">="] = 16;
		SpecialMap["<"] = 17;
		SpecialMap["<="] = 18;
		SpecialMap["=="] = 19;
		SpecialMap["!="] = 20;
		SpecialMap["="] = 21;
		SpecialMap[";"] = 22;
		SpecialMap["("] = 23;
		SpecialMap[")"] = 24;
		SpecialMap["{"] = 25;
		SpecialMap["}"] = 26;
		SpecialMap["["] = 100;
		SpecialMap["]"] = 101;
		SpecialMap[","] = 102;
		SpecialMap[":"] = 103;
		SpecialMap["++"] = 104;
		SpecialMap["--"] = 105;
		SpecialMap["&&"] = 106;
		SpecialMap["||"] = 107;
		SpecialMap["!"] = 108;
		SpecialMap["&"] = 109;
		SpecialMap["|"] = 110;
		SpecialMap["^"] = 111;
		SpecialMap["~"] = 112;
		SpecialMap["<<"] = 113;
		SpecialMap[">>"] = 114;
		SpecialMap["+="] = 115;
		SpecialMap["-="] = 116;
		SpecialMap["*="] = 117;
		SpecialMap["/="] = 118;
		SpecialMap["%="] = 119;
		SpecialMap["<<="] = 120;
		SpecialMap[">>="] = 121;
		SpecialMap["&="] = 122;
		SpecialMap["|="] = 123;
		SpecialMap["^="] = 124;
		SpecialMap["?"] = 125;
		SpecialMap["->"] = 126;
		SpecialMap["."] = 127;

		KeywordMap["int"] = 27;
		KeywordMap["float"] = 28;
		KeywordMap["double"] = 29;
		KeywordMap["char"] = 30;
		KeywordMap["void"] = 31;
		KeywordMap["unsigned"] = 32;
		KeywordMap["long"] = 33;
		KeywordMap["const"] = 34;
		KeywordMap["continue"] = 35;
		KeywordMap["break"] = 36;
		KeywordMap["enum"] = 37;
		KeywordMap["switch"] = 38;
		KeywordMap["case"] = 39;
		KeywordMap["static"] = 40;
		KeywordMap["auto"] = 41;
		KeywordMap["short"] = 42;
		KeywordMap["signed"] = 43;
		KeywordMap["struct"] = 44;
		KeywordMap["union"] = 45;
		KeywordMap["goto"] = 46;
		KeywordMap["default"] = 47;
		KeywordMap["extern"] = 48;
		KeywordMap["register"] = 49;
		KeywordMap["volatile"] = 50;
		KeywordMap["sizeof"] = 51;
		KeywordMap["typedef"] = 52;
		KeywordMap["#"] = 53;

		std::cout << "///\n";
	}

	/**
	 * 数据预处理
	 * 主要处理制表符('\t')
	 * 空格符将在词法分析时自动进行忽略
	 * 换行符将在词法分析时作为报错的依据
	 */
	std::string Preprocess(const std::string& Data)
	{
		std::string Result;
		const size_t Size = Data.size();
		size_t Lines = 0;
		for (size_t i = 0; i < Size; ++i)
		{
			if (Data[i] == '"')
			{
				Result += Data[i];
				++i;
				while (i < Size && Data[i] != '"')
				{
					if (Data[i] == '\\')
					{
						Result += Data[i];
						++i;
					}
					if (i < Size)
					{
						Result += Data[i];
						++i;
					}
				}
				if (Data[i] == '"')
				{
					Result += Data[i];
				}
			}
			else if (Data[i] == '\'')
			{
				Result += Data[i];
				++i;
				while (i < Size && Data[i] != '\'')
				{
					if (Data[i] == '\\')
					{
						Result += Data[i];
						++i;
					}
					Result += Data[i];
					++i;
				}
				if (Data[i] == '\'')
				{
					Result += Data[i];
				}
			}
			else if (Data[i] == '/' && (i + 1) < Size && Data[i + 1] == '/')// 处理单行注释
			{
				while (i < Size && Data[i] != '\n')
				{
					++i;
				}
				if (Data[i] == '\n')
				{
					++Lines;
					Result += Data[i];
				}
			}
			else if (Data[i] == '/' && (i + 1) < Size && Data[i + 1] == '*')// 处理多行注释
			{
				i = i + 2;
				bool IsFound = false;
				while (i < Size)
				{
					if (Data[i] == '\n')
					{
						++Lines;
					}
					if (Data[i] == '*' && (i + 1) < Size && Data[i + 1] == '/')
					{
						IsFound = true;
						i = i + 2;
						break;
					}
					++i;
				}
				if (Data[i] == '\n')
				{
					++Lines;
				}
				if (!IsFound)
				{
					TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(Lines) + " " + "no matching '*/' symbol was found.");
				}
			}
			else if (Data[i] == '\n')
			{
				++Lines;
				Result += Data[i];
			}
			else if (Data[i] == '\t')// 处理制表符
			{
				Result += ' ';
			}
			else
			{
				Result += Data[i];
			}
		}
		return Result;
	}

	/**
	 * 获取下一个有效字符,忽略空格符
	 */
	bool GetValidChar()
	{
		bool Result = false;
		while (TokenizeData.Index < TokenizeData.Size)
		{
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start != ' ')
			{
				Result = true;
				break;
			}
		}
		return Result;
	}

	/**
	 * 识别以字母为开始符号的
	 * 1. 关键字-letter(letter)*
	 * 2. 识别标识符-letter(letter|digit)*
	 */
	void TokenizeAlpha()
	{
		do
		{
			TokenizeData.TokenData.Token += TokenizeData.Start;
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
		}
		while (IsLetter(TokenizeData.Start) || IsDigit(TokenizeData.Start) || TokenizeData.Start == '$'|| TokenizeData.Start == '_');
		const auto Found = KeywordMap.find(TokenizeData.TokenData.Token);
		if (Found != KeywordMap.end())
		{
			TokenizeData.TokenData.Code = Found->second;
		}
		else
		{
			TokenizeData.TokenData.Code = RegularMap["letter(letter|digit)*"];
		}
		TokenizeData.Index--;
	}

	/**
	 * 识别以数字为开始符号的
	 * 1. 无符号十进制整型常量-digit digit*
	 */
	void TokenizeDigit()
	{
		if ('1' <= TokenizeData.Start && TokenizeData.Start <= '9')
		{
			do
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			}
			while (IsDigit(TokenizeData.Start));
			TokenizeData.TokenData.Code = RegularMap["digit digit*"];
			TokenizeData.Index--;
		}
		else
		{
			TokenizeData.TokenData.Token += TokenizeData.Start;
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == 'B' || TokenizeData.Start == 'b')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				do
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				}
				while (TokenizeData.Start == 0 || TokenizeData.Start == 1);
				TokenizeData.TokenData.Code = RegularMap["0(B|b)(0-1)*"];
				TokenizeData.Index--;
			}
			else if (TokenizeData.Start == 'X' || TokenizeData.Start == 'x')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				do
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				}
				while (IsDigit(TokenizeData.Start) || ('a' <= TokenizeData.Start and TokenizeData.Start <= 'f') || ('A' <= TokenizeData.Start and TokenizeData.Start <= 'F'));
				TokenizeData.TokenData.Code = RegularMap["0(X|x)(0-9 a-f)*"];
				TokenizeData.Index--;
			}
			else if ('0' <= TokenizeData.Start and TokenizeData.Start <= '7')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				do
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				}
				while ('0' <= TokenizeData.Start and TokenizeData.Start <= '7');
				TokenizeData.TokenData.Code = RegularMap["0(0-7)*"];
				TokenizeData.Index--;
			}
			else
			{
				TokenizeData.TokenData.Code = RegularMap["digit digit*"];
				TokenizeData.Index--;
			}
		}
	}

	/**
	 * 识别以非数字非字母为开始符号的
	 * 1. 运算符
	 * 2. 界限符
	 */
	void TokenizeSpecial()
	{
		for (auto& Special : SpecialMap)
		{
			if (TokenizeData.Start == Special.first[0])
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = Special.second;
				break;
			}
		}
		if (TokenizeData.Start == '(')
		{
			TokenizeData.Delimiter[0].push(TokenizeData.Start);
		}
		if (TokenizeData.Start == '[')
		{
			TokenizeData.Delimiter[1].push(TokenizeData.Start);
		}
		else if (TokenizeData.Start == '{')
		{
			TokenizeData.Delimiter[2].push(TokenizeData.Start);
		}
		else if (TokenizeData.Start == ')')
		{
			if (!TokenizeData.Delimiter[0].empty())
			{
				TokenizeData.Delimiter[0].pop();
			}
			else
			{
				TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an unmatched ')' symbol.");
			}
		}
		else if (TokenizeData.Start == ']')
		{
			if (!TokenizeData.Delimiter[1].empty())
			{
				TokenizeData.Delimiter[1].pop();
			}
			else
			{
				TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an unmatched ']' symbol.");
			}
		}
		else if (TokenizeData.Start == '}')
		{
			if (!TokenizeData.Delimiter[2].empty())
			{
				TokenizeData.Delimiter[2].pop();
			}
			else
			{
				TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an unmatched '}' symbol.");
			}
		}
		else if (TokenizeData.Start == '!')
		{
			TokenizeData.TokenData.Code = SpecialMap["!"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["!="];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '=')
		{
			TokenizeData.TokenData.Code = SpecialMap["="];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["=="];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '<')
		{
			TokenizeData.TokenData.Code = SpecialMap["<"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["<="];
			}
			else if (TokenizeData.Start == '<')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["<<"];
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				if (TokenizeData.Start == '=')
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.TokenData.Code = SpecialMap["<<="];
				}
				else
				{
					TokenizeData.Index--;
				}
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '>')
		{
			TokenizeData.TokenData.Code = SpecialMap[">"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap[">="];
			}
			else if (TokenizeData.Start == '>')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap[">>"];
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				if (TokenizeData.Start == '=')
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.TokenData.Code = SpecialMap[">>="];
				}
				else
				{
					TokenizeData.Index--;
				}
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '+')
		{
			TokenizeData.TokenData.Code = SpecialMap["+"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["+="];
			}
			else if (TokenizeData.Start == '+')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["++"];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '-')
		{
			TokenizeData.TokenData.Code = SpecialMap["-"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["-="];
			}
			else if (TokenizeData.Start == '-')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["--"];
			}
			else if (TokenizeData.Start == '>')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["->"];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '*')
		{
			TokenizeData.TokenData.Code = SpecialMap["*"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["*="];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '/')
		{
			TokenizeData.TokenData.Code = SpecialMap["/"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["/="];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '%')
		{
			TokenizeData.TokenData.Code = SpecialMap["%"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["%="];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '&')
		{
			TokenizeData.TokenData.Code = SpecialMap["&"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["&="];
			}
			else if (TokenizeData.Start == '&')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["&&"];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '|')
		{
			TokenizeData.TokenData.Code = SpecialMap["|"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["|="];
			}
			else if (TokenizeData.Start == '|')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["||"];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '^')
		{
			TokenizeData.TokenData.Code = SpecialMap["^"];
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			if (TokenizeData.Start == '=')
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.TokenData.Code = SpecialMap["^="];
			}
			else
			{
				TokenizeData.Index--;
			}
		}
		else if (TokenizeData.Start == '"')
		{
			TokenizeData.TokenData.Code = RegularMap["\"(all char)*\""];
			TokenizeData.TokenData.Token += TokenizeData.Start;
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			while (true)
			{
				if (TokenizeData.Start == '"')
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
					break;
				}
				if (TokenizeData.Start == '\\')
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				}
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			}
			TokenizeData.Index--;
		}
		else if (TokenizeData.Start == '\'')
		{
			TokenizeData.TokenData.Code = RegularMap["\'(all char)*\'"];
			TokenizeData.TokenData.Token += TokenizeData.Start;
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			while (true)
			{
				if (TokenizeData.Start == '\'')
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
					break;
				}
				if (TokenizeData.Start == '\\')
				{
					TokenizeData.TokenData.Token += TokenizeData.Start;
					TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
				}
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			}
			TokenizeData.Index--;
		}
		else if (TokenizeData.Start == '_')
		{
			TokenizeData.TokenData.Code = RegularMap["_(letter|digit|$)*"];
			do
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			}
			while (IsLetter(TokenizeData.Start) || IsDigit(TokenizeData.Start) || TokenizeData.Start == '$' || TokenizeData.Start == '_');
			TokenizeData.Index--;
		}
		else if (TokenizeData.Start == '$')
		{
			TokenizeData.TokenData.Code = RegularMap["$(letter|digit|$)*"];
			do
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			}
			while (IsLetter(TokenizeData.Start) || IsDigit(TokenizeData.Start) || TokenizeData.Start == '$' || TokenizeData.Start == '_');
			TokenizeData.Index--;
		}
		else if (TokenizeData.Start == '#')
		{
			TokenizeData.TokenData.Code = KeywordMap["#"];
			TokenizeData.TokenData.Token += TokenizeData.Start;
			TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			while (IsLetter(TokenizeData.Start))
			{
				TokenizeData.TokenData.Token += TokenizeData.Start;
				TokenizeData.Start = TokenizeData.Data[TokenizeData.Index++];
			}
			TokenizeData.Index--;
		}
		if (TokenizeData.TokenData.Code == 0)
		{
			TokenizeData.TokenData.Token = TokenizeData.Start;
			TokenizeData.Errors.push_back("ERR: " + TokenizeData.FilePath + " " + std::to_string(TokenizeData.Lines) + " there is an undefined " +
				TokenizeData.TokenData.Token + " character whose ascii code is " + std::to_string(static_cast<int>(TokenizeData.TokenData.Token[0])) + ".");
		}
	}

	/**
	 * 单词分析
	 */
	std::string Tokenize(const std::string& Data)
	{
		TokenizeData.Data = Data;
		TokenizeData.Index = 0;
		TokenizeData.Size = Data.size();

		while (GetValidChar())
		{
			TokenizeData.TokenData = {"", 0};

			if (TokenizeData.Start == '\n')
			{
				TokenizeData.Lines++;
			}
			else if (IsLetter(TokenizeData.Start))
			{
				TokenizeAlpha();
			}
			else if (IsDigit(TokenizeData.Start))
			{
				TokenizeDigit();
			}
			else
			{
				TokenizeSpecial();
			}
			if (!TokenizeData.TokenData.Token.empty())
			{
				TokenizeData.Result += "(" + TokenizeData.TokenData.Token + "," + std::to_string(TokenizeData.TokenData.Code) + ")" + "\n";
			}
		}
		return TokenizeData.Result;
	}

	/** 打印彩色字
	 * 0=黑色	1=蓝色	2=绿色	3=湖蓝色
	 * 4=红色	5=紫色	6=黄色   7=白色
	 * 8=灰色   9=淡蓝色	10=淡绿色 11=淡浅绿色
	 * 12=淡红色 13=淡紫色 14=淡黄色 15=亮白色
	 * @param ForeColor 字体颜色
	 * @param BackColor 字体背景颜色
	*/
	static void SetFontColor(int ForeColor, int BackColor)
	{
		SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), static_cast<WORD>(ForeColor + BackColor * 0x10));
	}

	//模板一下,方便调用,T表示任何可以被cout输出的类型
	template <typename T>
	static void CoutWithColor(T t, int ForeColor = 7, int BackColor = 0)
	{
		SetFontColor(ForeColor, BackColor);
		std::cout << t;
		SetFontColor(7, 0);
		Sleep(4);
	}

	/**
	 * 识别单词高亮
	 */
	static void Highlight(const std::string& SourceCode, const std::string& TokenData)
	{
		const size_t CodeSize = SourceCode.size();
		const size_t ContentSize = TokenData.size();
		size_t Index = 0;
		size_t Lines = 0;
		CoutWithColor(std::to_string(Lines) + "\t", 7);
		++Lines;
		for (size_t i = 0; i < CodeSize; ++i)
		{
			if (SourceCode[i] == '\n')
			{
				CoutWithColor(SourceCode[i] + std::to_string(Lines) + "\t", 7);
				++Lines;
			}
			else if (SourceCode[i] == ' ' || SourceCode[i] == '\t')
			{
				CoutWithColor(SourceCode[i], 7);
			}
			else if (SourceCode[i] == '/' && (i + 1) < CodeSize && SourceCode[i + 1] == '/')
			{
				std::string Temp;
				while (i < CodeSize && SourceCode[i] != '\n')
				{
					Temp += SourceCode[i++];
				}
				i--;
				CoutWithColor(Temp, 2);
			}
			else if (SourceCode[i] == '/' && (i + 1) < CodeSize && SourceCode[i + 1] == '*')
			{
				std::string Temp;
				Temp += SourceCode[i];
				Temp += SourceCode[i + 1];
				i = i + 2;
				while (i < CodeSize)
				{
					if (SourceCode[i] == '\n')
					{
						CoutWithColor(Temp, 2);
						CoutWithColor(SourceCode[i] + std::to_string(Lines) + "\t", 7);
						++Lines;
						++i;
						Temp = "";
					}
					else if (SourceCode[i] == '*' && (i + 1) < CodeSize && SourceCode[i + 1] == '/')
					{
						Temp += SourceCode[i++];
						Temp += SourceCode[i++];
						break;
					}
					else
					{
						Temp += SourceCode[i++];
					}
				}
				CoutWithColor(Temp, 2);
				i--;
			}
			else
			{
				std::string Doc;
				while (Index < ContentSize && TokenData[Index] != '\n')
				{
					Doc += TokenData[Index++];
				}
				if (TokenData[Index] == '\n')
				{
					Index++;
				}
				size_t Mid = 0;
				std::string Token;
				int Code = 0;

				// 寻找 , 分隔符
				for (size_t Pos = Doc.size() - 1; Pos != 0; --Pos)
				{
					if (Doc[Pos] != ',')
					{
						Mid = Pos - 1;
					}
					else
					{
						break;
					}
				}
				// 截取 Token
				for (size_t Start = 1; Start < Mid; ++Start)
				{
					Token += Doc[Start];
				}
				// 截取 Code
				for (size_t Start = Mid + 1; Start < (Doc.size() - 1); ++Start)
				{
					Code = Code * 10 + (Doc[Start] - '0');
				}

				// 判断类型颜色
				int ForeColor = 15;
				if ((10 <= Code and Code <= 26) or (100 <= Code and Code <= 127)) // 运算符和界限符
				{
					ForeColor = 6;
				}
				else if ((1 <= Code and Code <= 7) or (27 <= Code and Code <= 53)) // 关键字
				{
					ForeColor = 9;
				}
				else if (Code == 9 or (204 <= Code and Code <= 206)) // 无符号二,八,十,十六进制整型常数
				{
					ForeColor = 12;
				}
				else if (Code == 8 or Code == 202 or Code == 203) // 标识符
				{
					ForeColor = 10;
				}
				else if (Code == 200 or Code == 201) // 字符串常量
				{
					ForeColor = 14;
				}
				CoutWithColor(Token, ForeColor);
				i += Token.size() - 1;
			}
		}
		CoutWithColor("\n", 7);
	}

	/**
	 * 显示词法分析结果
	 */
	void ShowResult() const
	{
		std::cout << "///\n";
		CoutWithColor("VER: Copyright moota, private. all rights reserved.\n", 7);
		CoutWithColor("INF: Lexical analysis is finished, " + std::to_string(TokenizeData.Warnings.size()) + " warnings, " +
					  std::to_string(TokenizeData.Errors.size()) + " errors.\n", 14);
		for (auto& Info : TokenizeData.Warnings)
		{
			CoutWithColor(Info + "\n", 9);
		}
		for (auto& Info : TokenizeData.Errors)
		{
			CoutWithColor(Info + "\n", 12);
		}
	}

public:
	/**
	 * 调试词法分析器
	 */
	void Main()
	{
		// 初始词法分析器
		Initialize();

		// 读取处理文件
		const std::string Doc = ReadFile("Test.c");
		SaveFile(Doc, "TestRead.txt");

		// 预处理读入数据
		const std::string PreDoc = Preprocess(Doc);
		SaveFile(PreDoc, "TestPre.txt");

		// 识别单词符号
		const std::string Token = Tokenize(PreDoc);
		SaveFile(Token, "TestToken.txt");

		// 识别单词高亮
		Highlight(Doc, Token);

		// 显示词法分析结果
		ShowResult();

		getchar();
	}
};

int main()
{
	FLexicalAnalyzer LexicalAnalyzer;
	LexicalAnalyzer.Main();
	return 0;
}

  • 4
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值