作者:陈曦
日期:2012-6-16 10:21:31
环境:[Mac 10.7.1 Lion Intel-based x64 gcc4.2.1 xcode4.2]
转载请注明出处
Q: 对于编译过程的词法分析,到底应该使用什么方式?
A: 可以肯定的是,必然要对需要处理的数据挨个字符判断,然后在恰当的位置截断,得到一个个的token.
Q: 为什么得挨个字符都判断?
A: 因为编码采用源代码的方式,你无法判断程序员下一个字符是什么。比如int i;和int1i; 这两种语句显然含有不同的符号。
Q: 如何进行词法分析?
A: 一种很简单的思路就是,用一个状态保存在处理到各个字符时的状态,比如是标识符或者数字或者空格等等,直到状态改变到可以认定是不同token的时候结束。
Q: 给个设计图吧。
A: 现在不用看设计图,它来源于如下的实践。
// Sketch of the state-tracking idea only: this is a fragment, the loop that
// the `continue` statements belong to is not shown.
cur_state = STATE_UNKNOWN;
state = STATE_START;
// A digit switches the scanner into the number state.
// NOTE(review): buf is post-incremented inside the test itself, so it advances
// whether or not the character is a digit - confirm this is intended.
if(isdigit(*buf++))
{
state = STATE_NUM;
continue;
}
// A blank seen while in the number state ends the number: the finished kind is
// recorded in cur_state and the scanner state is set back.
// NOTE(review): this test post-increments buf again, so it looks at the
// character after the one tested above - confirm against the real code.
if(isblank(*buf++) && state == STATE_NUM)
{
cur_state = STATE_NUM;
// NOTE(review): STATE_BEGIN here vs. STATE_START at the top - presumably the
// same initial state; confirm which name is intended.
state = STATE_BEGIN;
continue;
}
其实,这里的核心在于把不同符号对应的字符区分开:当下一个字符不再属于当前符号时,就在此处截断,从而形成一个token。
Q: 是否在token类型变多的情形下,上面的代码将变得很复杂?
A: 是的。token类型一多,截断token的情形和条件也会随之变多,必须恰当处理才能正确截断token。这个过程内部繁琐但并不难,可能需要把复杂的部分分离到子模块中。
Q: 可以尝试写代码了吧。
A: 是的。如下是对上面描述的状态机的结构定义:
/* Scanner state carried through the lexical analysis of one input buffer. */
typedef struct
{
// Start of the scanner's private copy of the input text; createTokenState
// allocates it and freeTokenState releases it.
char *buf;
// First character of the token currently being collected.
char *begin;
// Character currently being examined; [begin, cur) is the pending token text.
char *cur;
// Set by createTokenState to buf + strlen(input), i.e. the terminating NUL.
char *end;
// Coarse classification of the token being collected.
Lex_state state;
// Finer classification within `state` (e.g. decimal vs. octal number).
Lex_sub_state sub_state;
}Token_state;
对于Lex_state和Lex_sub_state的定义如下:
/* Coarse scanner state: which kind of token is currently being collected. */
typedef enum
{
// Initial state assigned by createTokenState; no token kind chosen yet.
Lex_state_begin,
// Entered by getAllTokens when a letter or '_' is seen in the begin/id state.
Lex_state_id,
// Entered by getAllTokens when a digit is seen in the begin state.
Lex_state_literal_num,
// The remaining values are not referenced by the code shown alongside this
// enum; presumably reserved for character/string literals, operators,
// end of input and errors - TODO confirm.
Lex_state_literal_char,
Lex_state_literal_str,
Lex_state_op,
Lex_state_end,
Lex_state_err
}Lex_state;
/* Fine-grained scanner state that refines Lex_state. */
typedef enum
{
// Initial sub-state assigned by createTokenState.
Lex_sub_state_begin,
// Set by getAllTokens when '_' is accepted as part of an identifier.
Lex_sub_state_underline,
// Set by getAllTokens when a letter is accepted as part of an identifier.
Lex_sub_state_alpha,
// Set when a number starts with a non-zero digit.
Lex_sub_state_dec_num,
// Set when a number starts with '0'; digits 8 and 9 are then rejected.
Lex_sub_state_oct_num,
// Tested by getAllTokens but never assigned in the code shown here.
Lex_sub_state_hex_num,
// The remaining values are not referenced by the code shown alongside this
// enum; presumably reserved for operators, quotes, whitespace, ';' and
// character/string literal boundaries - TODO confirm.
Lex_sub_state_op,
Lex_sub_state_quot,
Lex_sub_state_space,
Lex_sub_state_semi,
Lex_sub_state_literal_char_begin,
Lex_sub_state_literal_char_end,
Lex_sub_state_literal_str_begin,
Lex_sub_state_literal_str_end,
Lex_sub_state_end
}Lex_sub_state;
Q: 对于最终分析出来的token,用什么结构保存?
A: 先做个简单的结构:
/* One lexical token. */
typedef struct
{
// Heap-allocated copy of the token text; released by freeToken.
char *name;
// Classification of the token; createToken/createTokenByLen initialise it to
// Token_type_err and leave the real classification to a later step.
Token_type type;
}Token;
/* Growable array of tokens. */
typedef struct
{
// Array of `capacity` token pointers; a slot is set to NULL by removeTokenAt.
Token **p;
// Number of slots in use (index of the next slot addTokenToTable fills).
int size;
// Number of slots allocated in `p`.
int capacity;
}TokensTable;
Token是单一的符号,TokensTable是数个Token的集合。Token_type的定义如下:
/* Classification of a token. */
typedef enum
{
// NOTE(review): the first four values are not assigned anywhere in the code
// shown with this enum; meanings are taken from the names - confirm.
Token_type_keyword,
Token_type_var,
Token_type_literal,
Token_type_operator,
// Default type given to every newly created token.
Token_type_err
}Token_type;
Q: 具体代码如何写?
A: 首先,我们得实现创建、销毁Token以及TokensTable的代码。
/*
 * Allocate a token whose text is a private copy of the NUL-terminated
 * string `name`.
 *
 * The type is initialised to Token_type_err; classification is left to the
 * caller. Returns NULL when either allocation fails. Release the result with
 * freeToken().
 */
Token *createToken(const char *name)
{
    Token *result = (Token *)malloc(sizeof(Token));
    if(result == NULL)
    {
        return NULL;
    }

    /* Room for the text plus its terminating NUL. */
    size_t textLen = strlen(name);
    char *text = (char *)malloc(textLen + 1);
    if(text == NULL)
    {
        free(result);
        return NULL;
    }

    strcpy(text, name);
    result->type = Token_type_err;
    result->name = text;
    return result;
}
/*
 * Allocate a token whose text is a copy of at most `len` characters of `name`.
 *
 * name: source text; it does not need to be NUL-terminated within `len`.
 * len:  maximum number of characters to copy.
 *
 * The copy is always NUL-terminated. The type is initialised to
 * Token_type_err. Returns NULL when an allocation fails or when len + 1
 * would overflow. Release the result with freeToken().
 */
Token *createTokenByLen(const char *name, size_t len)
{
    /* len + 1 below must not wrap around to 0. */
    if(len == (size_t)-1)
        return NULL;

    Token *tk = (Token *)malloc(sizeof(Token));
    if(!tk)
        return NULL;

    char *nameCp = (char *)malloc(len + 1);
    if(!nameCp)
        goto end;

    strncpy(nameCp, name, len);
    /* strncpy does not terminate the destination when the source has at least
       `len` characters, which is the normal case for a lexeme cut out of a
       larger buffer, so terminate explicitly. */
    nameCp[len] = '\0';

    tk->name = nameCp;
    tk->type = Token_type_err;
    return tk;

end:
    free(tk);
    return NULL;
}
/*
 * Print the address and text of `tk` on standard output.
 * `tk` must not be NULL.
 */
void printTokenName(Token *tk)
{
    /* %p requires a void * argument, so the token pointer is converted. */
    printf("%p tokenName is %s\n", (void *)tk, tk->name);
}
/*
 * Release a token and its text.
 * Passing NULL is a no-op, mirroring free().
 */
void freeToken(Token *tk)
{
    if(!tk)
        return;
    free(tk->name);
    free(tk);
}
Q: 为什么没把token的类型当参数传进来?
A: 因为,这里为了更单一化,将token的类型放到另一个过程去实现。
Q: 对于TokensTable的实现:
/*
 * Allocate an empty token table with room for `defaultSize` token pointers.
 *
 * Returns NULL when either the table or its slot array cannot be allocated.
 * Release the result with freeTokensTable().
 */
TokensTable *createTokensTable(int defaultSize)
{
    TokensTable *table = (TokensTable *)malloc(sizeof(TokensTable));
    if(table == NULL)
    {
        return NULL;
    }

    Token **slots = (Token **)malloc(sizeof(Token *) * defaultSize);
    if(slots == NULL)
    {
        /* The half-built table must not leak. */
        free(table);
        return NULL;
    }

    table->size = 0;
    table->capacity = defaultSize;
    table->p = slots;
    return table;
}
/*
 * Append a copy of `token` to the table.
 *
 * The table stores its own duplicate (text and type), so the caller keeps
 * ownership of `token`. When the table is full its slot array grows by a
 * fixed number of slots.
 *
 * Returns true on success; false when an argument is NULL or an allocation
 * fails, in which case the table is left unchanged.
 */
bool addTokenToTable(TokensTable *tt, const Token *token)
{
    /* Number of slots added each time the table runs out of room. */
    enum { TOKENS_TABLE_GROWTH = 128 };

    if(!tt || !token || !token->name)
        return false;

    Token *tk = (Token *)malloc(sizeof(Token));
    if(!tk)
        return false;
    tk->name = (char *)malloc(strlen(token->name) + 1);
    if(!tk->name)
        goto free_token;
    strcpy(tk->name, token->name);
    tk->type = token->type;

    if(tt->size >= tt->capacity)
    {
        /* Grow through a temporary so the old array survives a failed realloc. */
        int newCapacity = tt->capacity + TOKENS_TABLE_GROWTH;
        Token **grown = (Token **)realloc(tt->p, sizeof(Token *) * (size_t)newCapacity);
        if(!grown)
            goto free_name;
        tt->p = grown;
        tt->capacity = newCapacity;
    }

    tt->p[tt->size] = tk;
    ++tt->size;
    return true;

free_name:
    free(tk->name);
free_token:
    free(tk);
    return false;
}
/*
 * Release the token stored at `index` and leave a NULL hole in its slot.
 *
 * The table's size is not changed. Out-of-range indices, a NULL table and
 * slots that are already empty are ignored.
 */
void removeTokenAt(TokensTable *tt, int index)
{
    if(!tt || index < 0 || index >= tt->size)
        return;

    Token *tk = tt->p[index];
    if(!tk)
        return; /* already removed; freeToken must not see NULL */

    freeToken(tk);
    tt->p[index] = NULL;
}
/*
 * Remove the last slot of the table, releasing the token it holds (if any).
 *
 * The table shrinks by one so that the next addTokenToTable() reuses the slot
 * and a repeated call removes the preceding entry instead of revisiting the
 * slot that was just cleared. A NULL or empty table is ignored.
 */
void removeLastToken(TokensTable *tt)
{
    if(!tt || tt->size <= 0)
        return;

    int last = tt->size - 1;
    /* The slot may already be a hole left by removeTokenAt. */
    if(tt->p[last])
        removeTokenAt(tt, last);
    --tt->size;
}
/*
 * Release the table, its slot array and every token still stored in it.
 * Passing NULL is a no-op.
 */
void freeTokensTable(TokensTable *tt)
{
    if(!tt)
        return;

    for(int i = 0; i < tt->size; ++i)
    {
        /* Slots cleared by removeTokenAt are NULL. Tokens must go through
           freeToken so that their text is released too; a plain free() of the
           Token would leak `name`. */
        if(tt->p[i])
            freeToken(tt->p[i]);
    }
    free(tt->p);
    free(tt);
}
/*
 * Print the table address followed by every stored token, one per line,
 * numbered from 1 by slot position. Empty (NULL) slots are skipped but still
 * count towards the numbering. `tt` must not be NULL.
 */
void showAllTokens(TokensTable *tt)
{
    /* %p requires a void * argument, so the table pointer is converted. */
    printf("TokensTable:%p All tokens: ", (void *)tt);
    for(int i = 0; i < tt->size; ++i)
    {
        const Token *tk = tt->p[i];
        if(tk)
        {
            printf("Token %d:%s \n", i + 1, tk->name);
        }
    }
    printf("\n");
}
A: 好的,现在可以实现状态机了。
/* Create a scanner state over a private copy of `buf`; NULL on allocation failure. */
Token_state *createTokenState(char *buf);

/* Split the scanner's text into tokens appended to the table; false on error. */
bool getAllTokens(Token_state *ts, TokensTable *tk);

/* Release a scanner state together with its copy of the input text. */
void freeTokenState(Token_state *ts);
Q: 实现代码:
/*
 * Build a scanner state for the NUL-terminated text `buf`.
 *
 * The text is duplicated, so the caller's buffer is neither modified nor
 * retained. begin and cur start at the first character of the copy, end
 * points at its terminating NUL, and both state fields start at their
 * "begin" values.
 *
 * Returns NULL when an allocation fails. Release with freeTokenState().
 */
Token_state *createTokenState(char *buf)
{
    Token_state *ts = (Token_state *)malloc(sizeof(Token_state));
    if(ts == NULL)
    {
        return NULL;
    }

    size_t length = strlen(buf);
    char *copy = (char *)malloc(length + 1);
    if(copy == NULL)
    {
        free(ts);
        return NULL;
    }
    strcpy(copy, buf);

    ts->buf = copy;
    ts->begin = copy;
    ts->cur = copy;
    ts->end = copy + length;
    ts->state = Lex_state_begin;
    ts->sub_state = Lex_sub_state_begin;
    return ts;
}
/*
 * Copy the pending lexeme [ts->begin, ts->cur) into the table.
 *
 * Returns false, after reporting Error_type_not_enough_mem, when either the
 * temporary token or the table's own copy cannot be allocated.
 */
static bool savePendingToken(Token_state *ts, TokensTable *tt)
{
    Token *temp_tk = createTokenByLen(ts->begin, (size_t)(ts->cur - ts->begin));
    if(!temp_tk)
    {
        error_process(ts, Error_type_not_enough_mem);
        return false;
    }

    /* addTokenToTable stores its own copy, so the temporary is always released. */
    bool added = addTokenToTable(tt, temp_tk);
    freeToken(temp_tk);
    if(!added)
    {
        error_process(ts, Error_type_not_enough_mem);
        return false;
    }
    return true;
}

/*
 * Split the scanner's text into tokens and append them to `tt`.
 *
 * Tokens are separated by whitespace and by ';'. A ';' ends the token before
 * it and is the first character of the next one. Operators are not split off
 * yet, so a token runs until the next whitespace or ';'.
 *
 * While a token is collected, ts->state / ts->sub_state track its kind:
 *   - a letter or '_' at the start of a token makes it an identifier;
 *   - a leading non-zero digit starts a decimal number;
 *   - a leading '0' starts an octal number, in which the digits 8 and 9 are
 *     rejected with Error_type_oct_include_over_7_num;
 *   - "0x" / "0X" switches to a hexadecimal number.
 * The state is reset to "begin" every time a token is stored, so each token
 * is classified on its own.
 *
 * Returns true when the whole text was tokenised; false on a NULL argument
 * or after an error has been reported through error_process().
 */
bool getAllTokens(Token_state *ts, TokensTable *tt)
{
    if(!ts || !tt)
        return false;

    cc_skip_space((const char **)&ts->cur);
    /* The first token starts after any leading whitespace. */
    ts->begin = ts->cur;

    while(*ts->cur)
    {
        /* <ctype.h> functions require a value representable as unsigned char. */
        unsigned char ch = (unsigned char)*ts->cur;

        if(isalpha(ch))
        {
            if(ts->state == Lex_state_begin || ts->state == Lex_state_id)
            {
                ts->state = Lex_state_id;
                ts->sub_state = Lex_sub_state_alpha;
            }
            else if((ch == 'x' || ch == 'X')
                    && ts->sub_state == Lex_sub_state_oct_num
                    && ts->cur == ts->begin + 1)
            {
                /* "0x" prefix: what looked like an octal number is hexadecimal. */
                ts->sub_state = Lex_sub_state_hex_num;
            }
            /* Any other letter inside a non-identifier token is accepted
               without validation. */
        }
        else if(ch == '_')
        {
            if(ts->state == Lex_state_begin || ts->state == Lex_state_id)
            {
                ts->state = Lex_state_id;
                ts->sub_state = Lex_sub_state_underline;
            }
        }
        else if(isdigit(ch))
        {
            if(ts->state == Lex_state_begin)
            {
                /* A leading '0' announces an octal (or, with 'x', hex) number. */
                ts->state = Lex_state_literal_num;
                ts->sub_state = (ch == '0') ? Lex_sub_state_oct_num
                                            : Lex_sub_state_dec_num;
            }
            else if(ts->sub_state == Lex_sub_state_dec_num)
            {
                ts->state = Lex_state_literal_num;
            }
            else if(ts->sub_state == Lex_sub_state_oct_num)
            {
                if(ch >= '8')
                {
                    error_process(ts, Error_type_oct_include_over_7_num);
                    return false;
                }
                ts->state = Lex_state_literal_num;
            }
            /* Digits inside an identifier or a hexadecimal number need no
               state change. */
        }
        else if(isspace(ch) || ch == ';' /*|| isOpeChar()*/)
        {
            if(ts->cur == ts->begin)
            {
                /* The delimiter is itself the first character of a token
                   (a ';' that has just ended the previous one). */
                ++ts->cur;
                continue;
            }

            /* Save the token that ends just before this delimiter. */
            if(!savePendingToken(ts, tt))
                return false;

            cc_skip_space((const char **)&ts->cur);
            ts->begin = ts->cur;
            ts->state = Lex_state_begin;
            ts->sub_state = Lex_sub_state_begin;
            continue;
        }

        ++ts->cur;
    }

    /* Save the last token, unless the text ended right after a delimiter. */
    if(ts->cur > ts->begin)
    {
        if(!savePendingToken(ts, tt))
            return false;
    }
    return true;
}
/*
 * Release a scanner state and its private copy of the input text.
 * Passing NULL is a no-op, mirroring free().
 */
void freeTokenState(Token_state *ts)
{
    if(!ts)
        return;
    free(ts->buf);
    free(ts);
}
A: 测试代码如下:
/*
 * Smoke test: tokenise a fixed statement and print the resulting tokens.
 * Nothing is printed when tokenising fails.
 */
void testLexParser()
{
#if 1
    /* Other inputs tried during development: "0812" and "012". */
    char *str = "int i = 12;";

    Token_state *ts = createTokenState(str);
    if(ts == NULL)
    {
        printf("create Token_state error\n");
        return;
    }

    /* Start with a single slot so that table growth is exercised as well. */
    TokensTable *tt = createTokensTable(1);
    if(tt != NULL)
    {
        if(getAllTokens(ts, tt))
        {
            showAllTokens(tt);
        }
        freeTokensTable(tt);
    }
    else
    {
        printf("create TokensTable error\n");
    }

    freeTokenState(ts);
#endif
}
/* Program entry point: runs the lexer smoke test and reports success. */
int main(int argc, const char * argv[])
{
    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

#if 1
    testLexParser();
#endif

    return 0;
}
运行结果:
TokensTable:0x252810 All tokens: Token 1:int
Token 2:i
Token 3:=
Token 4:12
Token 5:;
当然,状态机判断逻辑看起来很复杂,如果需要,可以继续封装。
cc_skip_space的代码如下:
/*
 * Advance *p past any leading whitespace (as classified by isspace()).
 *
 * p: address of a pointer into a NUL-terminated string; on return *p points
 *    at the first non-whitespace character, or at the terminating NUL.
 *
 * A NULL `p` or a NULL `*p` is left untouched.
 */
void cc_skip_space(const char **p)
{
    if(!p || !*p)
        return;

    /* isspace() is only defined for EOF and unsigned char values, so a
       possibly negative plain char must be converted first. */
    while(isspace((unsigned char)**p))
    {
        ++*p;
    }
}
作者:陈曦
日期:2012-6-16 10:21:31
环境:[Mac 10.7.1 Lion Intel-based x64 gcc4.2.1 xcode4.2]
转载请注明出处