词法分析器部分完成。
支持:
1.支持单词分割
2.支持数字类型
3.支持字符串
4.支持换行
5.支持注释
不支持:
1.不支持关键字
2.不支持变量。
3.不支持关键字。
4.不支持操作符。
偶没有被那些个编译原理课程所吓倒。。。。。真的勇士,只管前行!
#ifndef _ISTREAMTOKENIZER_H_
#define _ISTREAMTOKENIZER_H_
#include <limits.h>
#include <string>
#include <istream>
#include <vector>
#define _COUNT_OF(a) (sizeof(a)/sizeof(a[0]))
/**
 * Splits the characters of a std::istream into tokens: numbers, words,
 * quoted strings, optional end-of-line markers and single "ordinary"
 * characters, skipping comments on the way. The design follows
 * java.io.StreamTokenizer.
 *
 * Every character code in 0..255 is classified through a lookup table that
 * the configuration methods (wordChars, whitespaceChars, quoteChar, ...)
 * modify. After each call to nextToken() the public fields ttype, sval and
 * nval describe the token that was read.
 *
 * The tokenizer keeps a reference to the stream; the stream must outlive it.
 */
class IstreamTokenizer
{
private:
    /**
     * Sentinel values stored in peekc instead of a real character.
     * NEED_CHAR means a new character must be read; SKIP_LF means a new
     * character must be read and, if it is '\n', discarded in favour of
     * the one after it (used right after a '\r' was reported as TT_EOL).
     */
    static const int SKIP_LF;
    static const int NEED_CHAR;

    /** Character-class flags stored in the classification table. */
    static const unsigned char CT_WHITESPACE;
    static const unsigned char CT_DIGIT;
    static const unsigned char CT_ALPHA;
    static const unsigned char CT_QUOTE;
    static const unsigned char CT_COMMENT;

public:
    /**
     * Token types reported through ttype and the return value of
     * nextToken(). TT_NOTHING is the state before any token was read.
     * Quoted strings and ordinary characters are reported with the
     * character code itself instead of one of these constants.
     */
    static const int TT_EOF;
    static const int TT_EOL;
    static const int TT_NUMBER;
    static const int TT_WORD;
    static const int TT_NOTHING;

private:
    /** Number of entries in the classification table (one per byte value). */
    enum { CTYPE_LENGTH = 256 };

    std::istream& input;        // source of characters
    std::vector<char> buf;      // scratch storage for the word/string being read
    int peekc;                  // look-ahead character, or NEED_CHAR / SKIP_LF
    bool pushedBack;            // true when the next nextToken() must repeat ttype
    bool forceLower;            // lower-case word tokens when set
    int LINENO;                 // current line number, starting at 1
    bool eolIsSignificantP;     // report line ends as TT_EOL
    bool slashSlashCommentsP;   // recognise slash-slash comments
    bool slashStarCommentsP;    // recognise slash-star comments
    unsigned char ctype[CTYPE_LENGTH];

public:
    /** Text of the current token for TT_WORD and quoted-string tokens; empty otherwise. */
    std::string sval;
    /** Value of the current token when ttype is TT_NUMBER. */
    double nval;
    /** Type of the token most recently read by nextToken(). */
    int ttype;

private:
    /** Installs the default syntax table on top of a cleared table. */
    void init()
    {
        // The table is a plain array and starts with indeterminate contents;
        // it has to be zeroed before flags are OR-ed into it.
        resetSyntax();
        wordChars('a', 'z');
        wordChars('A', 'Z');
        wordChars(128 + 32, 255);
        whitespaceChars(0, ' ');
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        parseNumbers();
    }

    /** Clamps [low, hi] to the valid index range of the classification table. */
    static void clampToTable(int& low, int& hi)
    {
        if (low < 0)
            low = 0;
        // Signed comparison on purpose: a negative hi must stay negative so
        // that the caller's loop does not run at all.
        if (hi >= CTYPE_LENGTH)
            hi = CTYPE_LENGTH - 1;
    }

    /** Tells whether ch is a valid index into the classification table. */
    static bool inTable(int ch)
    {
        return ch >= 0 && ch < CTYPE_LENGTH;
    }

    /** Lower-cases the ASCII letters of text in place; other bytes are left untouched. */
    static void lowerAscii(std::string& text)
    {
        for (std::string::size_type i = 0; i < text.size(); ++i) {
            if (text[i] >= 'A' && text[i] <= 'Z')
                text[i] = static_cast<char>(text[i] - 'A' + 'a');
        }
    }

    /**
     * Returns the class flags of character c. End of input (negative c) is
     * treated as whitespace so that it terminates words; values beyond the
     * table are treated as word characters.
     */
    int classify(int c) const
    {
        if (c < 0)
            return CT_WHITESPACE;
        return c < CTYPE_LENGTH ? ctype[c] : CT_ALPHA;
    }

    /**
     * Reads the next character from the stream.
     * @return the character code, or a negative value at end of input.
     */
    int read()
    {
        return input.get();
    }

    /**
     * Discards characters up to the next '\r', '\n' or end of input.
     * @return the terminating character (negative at end of input); it is
     *         not consumed logically and should be stored in peekc.
     */
    int skipToEndOfLine()
    {
        int c;
        do {
            c = read();
        } while (c != '\n' && c != '\r' && c >= 0);
        return c;
    }

    /**
     * Discards the body of a slash-star comment, including its closing
     * star-slash, and counts every line terminator inside it.
     * @return false if the input ended before the comment was closed.
     */
    bool skipBlockComment()
    {
        int prev = 0;
        for (;;) {
            const int c = read();
            if (c < 0)
                return false;
            if (c == '/' && prev == '*')
                return true;
            // "\r\n" is a single line end: the '\n' directly after a '\r'
            // has already been counted.
            if (c == '\r' || (c == '\n' && prev != '\r'))
                LINENO++;
            prev = c;
        }
    }

    /**
     * Reads a number whose first character is c ('-', '.' or a digit).
     * A '-' that is not followed by a digit or '.' is returned as the
     * ordinary character '-'.
     */
    int scanNumber(int c)
    {
        bool negative = false;
        if (c == '-') {
            c = read();
            if (c != '.' && (c < '0' || c > '9')) {
                peekc = c;
                return ttype = '-';
            }
            negative = true;
        }
        double value = 0;
        int fractionDigits = 0;
        int seenDot = 0;
        for (;;) {
            if (c == '.' && seenDot == 0) {
                seenDot = 1;
            } else if ('0' <= c && c <= '9') {
                value = value * 10 + (c - '0');
                fractionDigits += seenDot;
            } else {
                break;
            }
            c = read();
        }
        peekc = c;
        if (fractionDigits != 0) {
            // Accumulate the divisor and divide once, which loses less
            // precision than dividing digit by digit.
            double denom = 10;
            for (int k = fractionDigits - 1; k > 0; --k)
                denom *= 10;
            value = value / denom;
        }
        nval = negative ? -value : value;
        return ttype = TT_NUMBER;
    }

    /** Reads a word whose first character is c and stores it in sval. */
    int scanWord(int c)
    {
        buf.clear();
        int kind;
        do {
            buf.push_back(static_cast<char>(c));
            c = read();
            kind = classify(c);
        } while ((kind & (CT_ALPHA | CT_DIGIT)) != 0);
        peekc = c;
        sval.assign(buf.begin(), buf.end());
        if (forceLower)
            lowerAscii(sval);
        return ttype = TT_WORD;
    }

    /**
     * Reads a quoted string opened by the character quote and stores its
     * body, with escape sequences decoded, in sval. The string ends at the
     * matching quote, at a line terminator or at end of input.
     * @return the quote character, which is also stored in ttype.
     */
    int scanQuoted(int quote)
    {
        ttype = quote;
        buf.clear();
        int d = read();
        while (d >= 0 && d != quote && d != '\n' && d != '\r') {
            int c;
            if (d == '\\') {
                c = read();
                if (c < 0) {
                    // Backslash at end of input: nothing to escape.
                    d = c;
                    break;
                }
                const int first = c; /* To allow \377, but not \477 */
                if (c >= '0' && c <= '7') {
                    c = c - '0';
                    int c2 = read();
                    if ('0' <= c2 && c2 <= '7') {
                        c = (c << 3) + (c2 - '0');
                        c2 = read();
                        if ('0' <= c2 && c2 <= '7' && first <= '3') {
                            c = (c << 3) + (c2 - '0');
                            d = read();
                        } else {
                            d = c2;
                        }
                    } else {
                        d = c2;
                    }
                } else {
                    switch (c) {
                    case 'a': c = 0x7;  break;
                    case 'b': c = '\b'; break;
                    case 'f': c = 0xC;  break;
                    case 'n': c = '\n'; break;
                    case 'r': c = '\r'; break;
                    case 't': c = '\t'; break;
                    case 'v': c = 0xB;  break;
                    }
                    d = read();
                }
            } else {
                c = d;
                d = read();
            }
            buf.push_back(static_cast<char>(c));
        }
        /* If we left the loop because the matching quote was found, read a
         * new character next time around; otherwise keep the terminator.
         */
        peekc = (d == quote) ? NEED_CHAR : d;
        sval.assign(buf.begin(), buf.end());
        return ttype;
    }

public:
    /**
     * Creates a tokenizer reading from is with the default syntax:
     * letters and bytes 160..255 are word characters, bytes 0..' ' are
     * whitespace, '/' starts a single-line comment, '"' and '\'' delimit
     * strings, and numbers are parsed. Line ends are not significant,
     * slash-slash and slash-star comments are not recognised and words are
     * not lower-cased until the corresponding setters are called.
     *
     * @param is stream to read from; it must outlive the tokenizer.
     */
    IstreamTokenizer(std::istream& is)
        : input(is), buf(), peekc(NEED_CHAR), pushedBack(false),
          forceLower(false), LINENO(1), eolIsSignificantP(false),
          slashSlashCommentsP(false), slashStarCommentsP(false),
          sval(), nval(0), ttype(TT_NOTHING)
    {
        init();
    }

    /** Makes every character ordinary by clearing the whole classification table. */
    void resetSyntax()
    {
        for (int i = 0; i < CTYPE_LENGTH; ++i)
            ctype[i] = 0;
    }

    /**
     * Adds the word-character flag to every character in [low, hi].
     * The range is clamped to 0..255; an empty range changes nothing.
     */
    void wordChars(int low, int hi)
    {
        clampToTable(low, hi);
        for (int ch = low; ch <= hi; ++ch)
            ctype[ch] |= CT_ALPHA;
    }

    /**
     * Makes every character in [low, hi] whitespace, replacing any other
     * classification. The range is clamped to 0..255.
     */
    void whitespaceChars(int low, int hi)
    {
        clampToTable(low, hi);
        for (int ch = low; ch <= hi; ++ch)
            ctype[ch] = CT_WHITESPACE;
    }

    /**
     * Makes every character in [low, hi] ordinary (no classification).
     * The range is clamped to 0..255.
     */
    void ordinaryChars(int low, int hi)
    {
        clampToTable(low, hi);
        for (int ch = low; ch <= hi; ++ch)
            ctype[ch] = 0;
    }

    /** Makes ch ordinary; values outside 0..255 are ignored. */
    void ordinaryChar(int ch)
    {
        if (inTable(ch))
            ctype[ch] = 0;
    }

    /** Makes ch start a single-line comment; values outside 0..255 are ignored. */
    void commentChar(int ch)
    {
        if (inTable(ch))
            ctype[ch] = CT_COMMENT;
    }

    /** Makes ch a string delimiter; values outside 0..255 are ignored. */
    void quoteChar(int ch)
    {
        if (inTable(ch))
            ctype[ch] = CT_QUOTE;
    }

    /** Adds the numeric flag to the digits '0'..'9', to '.' and to '-'. */
    void parseNumbers()
    {
        for (int ch = '0'; ch <= '9'; ++ch)
            ctype[ch] |= CT_DIGIT;
        ctype['.'] |= CT_DIGIT;
        ctype['-'] |= CT_DIGIT;
    }

    /**
     * Determines whether ends of line are reported as tokens.
     *
     * When flag is true, nextToken() returns TT_EOL (and stores it in
     * ttype) for every line end it reads. A line ends with '\r', with '\n'
     * or with '\r' immediately followed by '\n'; the last form yields a
     * single TT_EOL. When flag is false, line-end characters only separate
     * tokens like any other whitespace.
     *
     * @param flag true to report line ends as TT_EOL tokens.
     */
    void eolIsSignificant(bool flag)
    {
        eolIsSignificantP = flag;
    }

    /** Enables or disables recognition of slash-star block comments. */
    void slashStarComments(bool flag)
    {
        slashStarCommentsP = flag;
    }

    /** Enables or disables recognition of slash-slash line comments. */
    void slashSlashComments(bool flag)
    {
        slashSlashCommentsP = flag;
    }

    /**
     * Determines whether word tokens are lower-cased before being stored
     * in sval. Only the ASCII letters 'A'..'Z' are converted.
     */
    void lowerCaseMode(bool fl)
    {
        forceLower = fl;
    }

    /**
     * Reads the next token from the stream.
     *
     * The token type is stored in ttype and returned:
     *  - TT_EOF at end of input;
     *  - TT_EOL at a line end, when eolIsSignificant(true) is in effect;
     *  - TT_NUMBER for a number, whose value is stored in nval;
     *  - TT_WORD for a word, whose text is stored in sval;
     *  - the quote character for a quoted string, whose decoded body is
     *    stored in sval;
     *  - the character code itself for any other character.
     * Comments are skipped. sval is emptied at the start of every read, so
     * it never holds the text of an earlier token. After pushBack() the
     * previous token type is returned again without reading anything.
     *
     * @return the value stored in ttype.
     */
    int nextToken()
    {
        if (pushedBack) {
            pushedBack = false;
            return ttype;
        }
        sval.clear();

        // Each pass reads one token; a pass that only skipped a comment
        // starts over instead of recursing.
        for (;;) {
            int c = peekc;
            if (c < 0)
                c = NEED_CHAR;
            if (c == SKIP_LF) {
                c = read();
                if (c < 0)
                    return ttype = TT_EOF;
                if (c == '\n')
                    c = NEED_CHAR;
            }
            if (c == NEED_CHAR) {
                c = read();
                if (c < 0)
                    return ttype = TT_EOF;
            }
            ttype = c; /* Just to be safe */

            /* The next invocation reads a fresh character unless peekc is
             * set again below.
             */
            peekc = NEED_CHAR;

            int kind = classify(c);
            while ((kind & CT_WHITESPACE) != 0) {
                if (c == '\r') {
                    LINENO++;
                    if (eolIsSignificantP) {
                        // The line end is reported as a token; a '\n'
                        // directly after it belongs to the same line end.
                        peekc = SKIP_LF;
                        return ttype = TT_EOL;
                    }
                    c = read();
                    if (c == '\n')
                        c = read();
                } else {
                    if (c == '\n') {
                        LINENO++;
                        if (eolIsSignificantP) {
                            // The line end is reported as a token.
                            return ttype = TT_EOL;
                        }
                    }
                    c = read();
                }
                if (c < 0)
                    return ttype = TT_EOF;
                kind = classify(c);
            }

            if ((kind & CT_DIGIT) != 0)
                return scanNumber(c);
            if ((kind & CT_ALPHA) != 0)
                return scanWord(c);
            if ((kind & CT_QUOTE) != 0)
                return scanQuoted(c);

            if (c == '/' && (slashSlashCommentsP || slashStarCommentsP)) {
                c = read();
                if (c == '*' && slashStarCommentsP) {
                    if (!skipBlockComment())
                        return ttype = TT_EOF;
                    continue;
                }
                if (c == '/' && slashSlashCommentsP) {
                    peekc = skipToEndOfLine();
                    continue;
                }
                /* Now see if it is still a single line comment */
                if ((ctype['/'] & CT_COMMENT) != 0) {
                    // c is already the first character after the '/'; if it
                    // ends the line (or the input) the comment is empty and
                    // nothing more may be discarded.
                    if (c != '\n' && c != '\r' && c >= 0)
                        c = skipToEndOfLine();
                    peekc = c;
                    continue;
                }
                peekc = c;
                return ttype = '/';
            }

            if ((kind & CT_COMMENT) != 0) {
                peekc = skipToEndOfLine();
                continue;
            }

            return ttype = c;
        }
    }

    /**
     * Makes the next call to nextToken() return the current ttype again
     * without reading from the stream. Has no effect before the first
     * token has been read (ttype is TT_NOTHING).
     */
    void pushBack()
    {
        if (ttype != TT_NOTHING)
            pushedBack = true;
    }

    /** Returns the current line number; the first line is line 1. */
    int lineno() const
    {
        return LINENO;
    }

    /** Declared here, defined outside this header. */
    std::string toString();
};
// Character-class flags. Each one occupies its own bit so that a single
// table entry can carry several classes at once (e.g. digit and word).
const unsigned char IstreamTokenizer::CT_WHITESPACE = 1 << 0;
const unsigned char IstreamTokenizer::CT_DIGIT      = 1 << 1;
const unsigned char IstreamTokenizer::CT_ALPHA      = 1 << 2;
const unsigned char IstreamTokenizer::CT_QUOTE      = 1 << 3;
const unsigned char IstreamTokenizer::CT_COMMENT    = 1 << 4;
// Look-ahead sentinels: large positive values, distinct from every character
// code in 0..255 and from the negative end-of-input result.
const int IstreamTokenizer::NEED_CHAR = INT_MAX;
const int IstreamTokenizer::SKIP_LF   = INT_MAX - 1;
#endif