自己动手写C语言编译器（3）

最新推荐文章于 2023-10-28 13:17:56 发布
daojin505
最新推荐文章于 2023-10-28 13:17:56 发布
阅读量471
点赞数
本文链接：https://blog.csdn.net/daojin505/article/details/76813754
版权
词法分析器部分完成。
支持：
1.支持单词分割
2.支持数字类型
3.支持字符串
4.支持换行
5.支持注释
不支持：
1.不支持关键字
2.不支持变量。
3.不支持关键字。
4.不支持操作符。
偶没有被那些个编译原理课程所吓倒。。。。。真的勇士，只管前行！
#ifndef _ISTREAMTOKENIZER_H_
#define _ISTREAMTOKENIZER_H_

#include <limits.h>

#include <cctype>
#include <istream>
#include <string>
#include <vector>
/* Number of elements in a true array `a` (not a pointer): total byte size
 * divided by the size of one element. The argument is parenthesized before
 * indexing so that expression arguments expand as intended. */
#define _COUNT_OF(a) (sizeof(a) / sizeof((a)[0]))

/**
 * Splits a std::istream into tokens: words, numbers, quoted strings,
 * end-of-line markers and single "ordinary" characters, optionally skipping
 * comments. The doc comments reference java.io.StreamTokenizer, and the
 * interface (ttype / sval / nval, nextToken(), pushBack()) follows it.
 *
 * The tokenizer keeps a reference to the stream passed to the constructor;
 * the stream must outlive the tokenizer.
 */
class IstreamTokenizer
{
private:
    /**
     * The next character to be considered by nextToken() is kept in peekc.
     * peekc may also be NEED_CHAR to indicate that a new character should be
     * read, or SKIP_LF to indicate that a new character should be read and,
     * if it is a '\n' character, it should be discarded and a second new
     * character should be read.
     */
    static const int SKIP_LF;
    static const int NEED_CHAR;

    // Character-class bit flags stored in the ctype table.
    static const unsigned char CT_WHITESPACE;
    static const unsigned char CT_DIGIT;
    static const unsigned char CT_ALPHA;
    static const unsigned char CT_QUOTE;
    static const unsigned char CT_COMMENT;

    // Number of entries in the character-class table (one per byte value).
    // A signed constant, so range checks against negative ints behave.
    enum { CTYPE_SIZE = 256 };

public:
    // Token types reported through ttype and the return value of nextToken().
    static const int TT_EOF;
    static const int TT_EOL;
    static const int TT_NUMBER;
    static const int TT_WORD;
    static const int TT_NOTHING;

private:
    std::istream& input;            // source of characters (not owned)
    std::vector<char> buf;          // scratch buffer for the token being built
    int peekc;                      // look-ahead character, NEED_CHAR or SKIP_LF
    bool pushedBack;                // true if nextToken() must replay ttype
    bool forceLower;                // lower-case TT_WORD tokens when true
    int LINENO;                     // current line number
    bool eolIsSignificantP;         // report end of line as TT_EOL when true
    bool slashSlashCommentsP;       // recognise // comments when true
    bool slashStarCommentsP;        // recognise /* */ comments when true
    unsigned char ctype[CTYPE_SIZE]; // CT_* flags for each byte value

public:
    std::string sval;   // text of the last word or quoted-string token
    double nval;        // value of the last TT_NUMBER token
    int ttype;          // type of the last token (TT_* or the character itself)

private:
    /** Installs the default syntax table (letters, whitespace, '/', quotes, numbers). */
    void init()
    {
        wordChars('a', 'z');
        wordChars('A', 'Z');
        wordChars(128 + 32, 255);
        whitespaceChars(0, ' ');
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        parseNumbers();
    }

    /**
     * Clamps [low, hi] to the valid index range of ctype. A negative hi is
     * left negative so that the caller's `low <= hi` loop does not run.
     */
    static void clampRange(int& low, int& hi)
    {
        if (low < 0)
            low = 0;
        if (hi >= CTYPE_SIZE)
            hi = CTYPE_SIZE - 1;
    }

public:
    /**
     * Creates a tokenizer reading from `is` with the default syntax table.
     * All flags start off, the line counter starts at 1 and ttype starts as
     * TT_NOTHING (no token read yet).
     */
    IstreamTokenizer(std::istream& is)
        : input(is),
          peekc(NEED_CHAR),
          pushedBack(false),
          forceLower(false),
          LINENO(1),
          eolIsSignificantP(false),
          slashSlashCommentsP(false),
          slashStarCommentsP(false),
          nval(0.0),
          ttype(TT_NOTHING)
    {
        // The table must be zeroed first: init() combines flags with |=.
        resetSyntax();
        init();
    }

    /** Clears the syntax table so that every character is "ordinary". */
    void resetSyntax()
    {
        for (int i = 0; i < CTYPE_SIZE; ++i)
            ctype[i] = 0;
    }

    /** Marks every character in [low, hi] as a word constituent (flag is added). */
    void wordChars(int low, int hi)
    {
        clampRange(low, hi);
        while (low <= hi)
            ctype[low++] |= CT_ALPHA;
    }

    /** Marks every character in [low, hi] as whitespace (replaces other flags). */
    void whitespaceChars(int low, int hi)
    {
        clampRange(low, hi);
        while (low <= hi)
            ctype[low++] = CT_WHITESPACE;
    }

    /** Marks every character in [low, hi] as ordinary (clears all flags). */
    void ordinaryChars(int low, int hi)
    {
        clampRange(low, hi);
        while (low <= hi)
            ctype[low++] = 0;
    }

    /** Marks `ch` as ordinary; values outside the table are ignored. */
    void ordinaryChar(int ch)
    {
        if (ch >= 0 && ch < CTYPE_SIZE)
            ctype[ch] = 0;
    }

    /** Makes `ch` start a single-line comment; values outside the table are ignored. */
    void commentChar(int ch)
    {
        if (ch >= 0 && ch < CTYPE_SIZE)
            ctype[ch] = CT_COMMENT;
    }

    /** Makes `ch` a string delimiter; values outside the table are ignored. */
    void quoteChar(int ch)
    {
        if (ch >= 0 && ch < CTYPE_SIZE)
            ctype[ch] = CT_QUOTE;
    }

    /** Adds the digit flag to '0'..'9', '.' and '-' so numbers are parsed. */
    void parseNumbers()
    {
        for (int i = '0'; i <= '9'; i++)
            ctype[i] |= CT_DIGIT;
        ctype['.'] |= CT_DIGIT;
        ctype['-'] |= CT_DIGIT;
    }

    /**
     * Determines whether or not ends of line are treated as tokens.
     * If the flag argument is true, nextToken() returns TT_EOL and also sets
     * ttype to this value when an end of line is read.
     *
     * A line is a sequence of characters ending with either a carriage-return
     * character ('\r') or a newline character ('\n'). In addition, a
     * carriage-return character followed immediately by a newline character
     * is treated as a single end-of-line token.
     *
     * If the flag is false, end-of-line characters are treated as white space
     * and serve only to separate tokens.
     *
     * @param flag true indicates that end-of-line characters are separate
     *             tokens; false indicates that they are white space.
     * @see java.io.StreamTokenizer#nextToken()
     * @see java.io.StreamTokenizer#ttype
     * @see java.io.StreamTokenizer#TT_EOL
     */
    void eolIsSignificant(bool flag)
    {
        eolIsSignificantP = flag;
    }

    /** Enables or disables recognition of C-style block comments. */
    void slashStarComments(bool flag)
    {
        slashStarCommentsP = flag;
    }

    /** Enables or disables recognition of C++-style line comments. */
    void slashSlashComments(bool flag)
    {
        slashSlashCommentsP = flag;
    }

    /** When `fl` is true, TT_WORD tokens are lower-cased before being stored in sval. */
    void lowerCaseMode(bool fl)
    {
        forceLower = fl;
    }

private:
    /** Reads the next character from the stream (negative at end of input). */
    int read()
    {
        return input.get();
    }

public:
    /**
     * Parses the next token from the stream.
     *
     * @return the new value of ttype: TT_EOF, TT_EOL, TT_NUMBER (value in
     *         nval), TT_WORD (text in sval), the quote character for a quoted
     *         string (text in sval), or the character itself for an ordinary
     *         character.
     */
    int nextToken()
    {
        if (pushedBack) {
            pushedBack = false;
            return ttype;
        }

        int c = peekc;
        if (c < 0)
            c = NEED_CHAR;
        if (c == SKIP_LF) {
            c = read();
            if (c < 0)
                return ttype = TT_EOF;
            if (c == '\n')
                c = NEED_CHAR;
        }
        if (c == NEED_CHAR) {
            c = read();
            if (c < 0)
                return ttype = TT_EOF;
        }
        ttype = c; /* Just to be safe */

        /* Set peekc so that the next invocation of nextToken will read
         * another character unless peekc is reset in this invocation. */
        peekc = NEED_CHAR;

        int cls = c < CTYPE_SIZE ? ctype[c] : CT_ALPHA;

        // Skip whitespace, counting lines and reporting EOL if requested.
        while ((cls & CT_WHITESPACE) != 0) {
            if (c == '\r') {
                LINENO++;
                if (eolIsSignificantP) {
                    // Report the line end; a following '\n' belongs to it.
                    peekc = SKIP_LF;
                    return ttype = TT_EOL;
                }
                c = read();
                if (c == '\n')
                    c = read();
            } else {
                if (c == '\n') {
                    LINENO++;
                    if (eolIsSignificantP) {
                        // Report the line end as its own token.
                        return ttype = TT_EOL;
                    }
                }
                c = read();
            }
            if (c < 0)
                return ttype = TT_EOF;
            cls = c < CTYPE_SIZE ? ctype[c] : CT_ALPHA;
        }

        // Number: optional '-', digits, at most one '.'.
        if ((cls & CT_DIGIT) != 0) {
            bool neg = false;
            if (c == '-') {
                c = read();
                if (c != '.' && (c < '0' || c > '9')) {
                    // A lone '-' is an ordinary character, not a number.
                    peekc = c;
                    return ttype = '-';
                }
                neg = true;
            }
            double v = 0;
            int decexp = 0;   // number of digits seen after the '.'
            int seendot = 0;
            while (true) {
                if (c == '.' && seendot == 0)
                    seendot = 1;
                else if ('0' <= c && c <= '9') {
                    v = v * 10 + (c - '0');
                    decexp += seendot;
                } else
                    break;
                c = read();
            }
            peekc = c;
            if (decexp != 0) {
                double denom = 10;
                decexp--;
                while (decexp > 0) {
                    denom *= 10;
                    decexp--;
                }
                /* Do one division of a likely-to-be-more-accurate number */
                v = v / denom;
            }
            nval = neg ? -v : v;
            return ttype = TT_NUMBER;
        }

        // Word: a word character followed by word characters or digits.
        if ((cls & CT_ALPHA) != 0) {
            buf.clear();
            do {
                buf.push_back(static_cast<char>(c));
                c = read();
                cls = c < 0 ? CT_WHITESPACE
                            : (c < CTYPE_SIZE ? ctype[c] : CT_ALPHA);
            } while ((cls & (CT_ALPHA | CT_DIGIT)) != 0);

            peekc = c;

            sval.assign(buf.begin(), buf.end());
            if (forceLower) {
                for (std::string::size_type k = 0; k < sval.size(); ++k) {
                    // Cast through unsigned char: std::tolower requires a
                    // value representable as unsigned char (or EOF).
                    sval[k] = static_cast<char>(
                        std::tolower(static_cast<unsigned char>(sval[k])));
                }
            }

            return ttype = TT_WORD;
        }

        // Quoted string: runs to the matching quote, end of line or EOF.
        if ((cls & CT_QUOTE) != 0) {
            ttype = c;
            buf.clear();
            int d = read();
            while (d >= 0 && d != ttype && d != '\n' && d != '\r') {
                if (d == '\\') {
                    c = read();
                    const int first = c; /* To allow \377, but not \477 */
                    if (c >= '0' && c <= '7') {
                        // Octal escape of one to three digits.
                        c = c - '0';
                        int c2 = read();
                        if ('0' <= c2 && c2 <= '7') {
                            c = (c << 3) + (c2 - '0');
                            c2 = read();
                            if ('0' <= c2 && c2 <= '7' && first <= '3') {
                                c = (c << 3) + (c2 - '0');
                                d = read();
                            } else
                                d = c2;
                        } else
                            d = c2;
                    } else {
                        // Single-letter escape; unknown letters stand for themselves.
                        switch (c) {
                        case 'a':
                            c = 0x7;
                            break;
                        case 'b':
                            c = '\b';
                            break;
                        case 'f':
                            c = 0xC;
                            break;
                        case 'n':
                            c = '\n';
                            break;
                        case 'r':
                            c = '\r';
                            break;
                        case 't':
                            c = '\t';
                            break;
                        case 'v':
                            c = 0xB;
                            break;
                        }
                        d = read();
                    }
                } else {
                    c = d;
                    d = read();
                }
                buf.push_back(static_cast<char>(c));
            }

            /* If we broke out of the loop because we found a matching quote
             * character then arrange to read a new character next time
             * around; otherwise, save the character. */
            peekc = (d == ttype) ? NEED_CHAR : d;

            sval.assign(buf.begin(), buf.end());

            return ttype;
        }

        // '/' when C or C++ style comments are enabled.
        if (c == '/' && (slashSlashCommentsP || slashStarCommentsP)) {
            c = read();
            if (c == '*' && slashStarCommentsP) {
                int prevc = 0;
                while ((c = read()) != '/' || prevc != '*') {
                    if (c == '\r') {
                        LINENO++;
                        c = read();
                        if (c == '\n') {
                            c = read();
                        }
                    } else {
                        if (c == '\n') {
                            LINENO++;
                            c = read();
                        }
                    }
                    if (c < 0)
                        return ttype = TT_EOF;
                    prevc = c;
                }
                return nextToken();
            } else if (c == '/' && slashSlashCommentsP) {
                while ((c = read()) != '\n' && c != '\r' && c >= 0)
                    ;
                peekc = c;
                return nextToken();
            } else {
                /* Now see if it is still a single line comment */
                if ((ctype['/'] & CT_COMMENT) != 0) {
                    while ((c = read()) != '\n' && c != '\r' && c >= 0)
                        ;
                    peekc = c;
                    return nextToken();
                } else {
                    peekc = c;
                    return ttype = '/';
                }
            }
        }

        // Single-line comment introduced by a comment character.
        if ((cls & CT_COMMENT) != 0) {
            while ((c = read()) != '\n' && c != '\r' && c >= 0)
                ;
            peekc = c;
            return nextToken();
        }

        return ttype = c;
    }

    /**
     * Causes the next call to nextToken() to return the current ttype again
     * without reading from the stream. Has no effect while ttype is TT_NOTHING.
     */
    void pushBack()
    {
        if (ttype != TT_NOTHING)
            pushedBack = true;
    }

    /** Returns the current line number. */
    int lineno()
    {
        return LINENO;
    }

    // Declared here; no definition is visible in this header.
    std::string toString();
};

// Character-class flags: each is a distinct single bit so that several
// classes can be combined in one entry of the ctype table.
const unsigned char IstreamTokenizer::CT_WHITESPACE = 0x01;
const unsigned char IstreamTokenizer::CT_DIGIT      = 0x02;
const unsigned char IstreamTokenizer::CT_ALPHA      = 0x04;
const unsigned char IstreamTokenizer::CT_QUOTE      = 0x08;
const unsigned char IstreamTokenizer::CT_COMMENT    = 0x10;

// Sentinel values for peekc, taken from the top of the int range.
const int IstreamTokenizer::SKIP_LF   = INT_MAX - 1;
const int IstreamTokenizer::NEED_CHAR = INT_MAX;

#endif