1.04 Flex实现C语言词法分析器

此处并不打算对Flex的基本用法和格式规则进行详解,有关这些内容请参考

直接上Flex代码,源码见

/*
// file: 1.04_NextToken_with_Flex.l
在做词法分析时,我们经常需要一个NextToken函数,每次调用它都会返回下一个已识别的Token。
本例子结合Flex来实现NextToken函数的基本功能,用于识别C语言中的token。
该词法分析器仅支持C89(ANSI C)标准的关键字识别,并不支持后续新标准增加的关键字。
*/

%{
// Token categories returned by the scanner rules. Values start at 1001;
// 0 is what yylex() returns at end of input, so no token uses it.
enum TokenType
{
    INTEGER                 = 1001, // integer literal, e.g. 123
    DECIMAL                 = 1002, // decimal (floating-point) literal, e.g. 123.554
    IDENTIFIER              = 1003, // identifier, e.g. adfa
    OPERATOR                = 1004, // operator, e.g. + or *
    KEYWORD                 = 1005, // keyword, e.g. if or for
    L_PARENTHESIS           = 1006, // left parenthesis
    R_PARENTHESIS           = 1007, // right parenthesis
    L_SQUARE_BRACKET        = 1008, // left square bracket
    R_SQUARE_BRACKET        = 1009, // right square bracket
    L_BRACE                 = 1010, // left curly brace
    R_BRACE                 = 1011, // right curly brace
    QUESTION_SIGN           = 1012, // question mark
    COMMA                   = 1013, // comma
    COLON                   = 1014, // colon
    SEMICOLON               = 1015, // semicolon
    POUND_SIGN              = 1016, // pound (hash) sign
    LESS_THAN               = 1017, // less than
    LESS_EQUAL_THAN         = 1018, // less than or equal
    GREAT_THAN              = 1019, // greater than
    GREAT_EQUAL_THAN        = 1020, // greater than or equal

    QUOTATION               = 1021, // single quote character

    STRING                  = 1022, // double-quoted string literal
    ANNOTATION              = 1023, // single-line comment (introduced by two slashes)
    MULTI_LINE_ANNOTATION   = 1024, // multi-line comment (slash-star ... star-slash)

};
// Semantic-value slot, initialised to 0; no rule in this specification assigns it.
int yylval = 0;
%}

%%
[-+*/%]                 { return OPERATOR; }        // arithmetic: plus, minus, times, divide, modulo
"**"                    { return OPERATOR; }        // power operator (two asterisks)
[~|&^]                  { return OPERATOR; }        // bitwise: not, or, and, xor
">>"|"<<"               { return OPERATOR; }        // bitwise shifts
"&&"|"||"|"!"           { return OPERATOR; }        // logical: and, or, not
"->"                    { return OPERATOR; }        // member access through a pointer

"["                     { return L_SQUARE_BRACKET; }    // left square bracket
"]"                     { return R_SQUARE_BRACKET; }    // right square bracket

"("                     { return L_PARENTHESIS; }       // left parenthesis
")"                     { return R_PARENTHESIS; }       // right parenthesis

"{"                     { return L_BRACE; }             // left curly brace
"}"                     { return R_BRACE; }             // right curly brace

"<"                     { return LESS_THAN; }           // less than
"<="                    { return LESS_EQUAL_THAN; }     // less than or equal
">"                     { return GREAT_THAN; }          // greater than
">="                    { return GREAT_EQUAL_THAN; }    // greater than or equal

"?"                     { return QUESTION_SIGN; }       // question mark
","                     { return COMMA; }               // comma
":"                     { return COLON; }               // colon
";"                     { return SEMICOLON; }           // semicolon
"#"                     { return POUND_SIGN; }          // pound sign
'                       { return QUOTATION; }           // single quote character




"auto"                  { return KEYWORD; }
"break"                 { return KEYWORD; }
"case"                  { return KEYWORD; }
"char"                  { return KEYWORD; }
"const"                 { return KEYWORD; }
"continue"              { return KEYWORD; }
"default"               { return KEYWORD; }
"do"                    { return KEYWORD; }
"double"                { return KEYWORD; }
"else"                  { return KEYWORD; }
"enum"                  { return KEYWORD; }
"extern"                { return KEYWORD; }
"float"                 { return KEYWORD; }
"for"                   { return KEYWORD; }
"goto"                  { return KEYWORD; }
"if"                    { return KEYWORD; }
"int"                   { return KEYWORD; }
"long"                  { return KEYWORD; }
"register"              { return KEYWORD; }
"return"                { return KEYWORD; }
"short"                 { return KEYWORD; }
"signed"                { return KEYWORD; }
"sizeof"                { return KEYWORD; }
"static"                { return KEYWORD; }
"struct"                { return KEYWORD; }
"switch"                { return KEYWORD; }
"typedef"               { return KEYWORD; }
"union"                 { return KEYWORD; }
"unsigned"              { return KEYWORD; }
"void"                  { return KEYWORD; }
"volatile"              { return KEYWORD; }
"while"                 { return KEYWORD; }


[-+]?[0-9]+                                                 { return INTEGER; } // 识别整数
[a-zA-Z_][a-zA-Z0-9_]*                                      { return IDENTIFIER; } // 识别标识符
[-+]?(([0-9]*\.?[0-9]+)|([0-9]+\.[0-9]*))(E[+-]?[0-9]+)?    { return DECIMAL; } // 识别小数,支持小数的科学计数法识别

\"[^"]*\"                                                   { return STRING; }           // 识别字符串

"//"[^\n]*\n                                               { return ANNOTATION; }   // 识别单行注释
"/*"([^*]|\*+[^/*])*"*/"                                   { return MULTI_LINE_ANNOTATION; }   // 识别多行注释

[ \t\r\n]+              { /* skip blanks, tabs and line breaks; unmatched line breaks would otherwise be echoed by the default rule */ }
%%

/*
 * Return the next token recognised in the input.
 *
 * This is a thin wrapper around yylex(). Each call to yylex() scans the
 * global input file yyin (stdin by default) and keeps going until either the
 * end of the input is reached, in which case it returns 0, or one of the rule
 * actions executes a return statement, in which case that value is returned.
 *
 * Returns: the value produced by yylex(), i.e. a TokenType value from a rule
 *          action, or 0 at end of input.
 */
int NextToken(void)
{
    return yylex();
}

为了能够快速方便地管理项目的构建,创建CMakeLists.txt,内容如下:

cmake_minimum_required(VERSION 3.14)
project(1.04_NextToken_with_Flex)

# Locate the flex executable; configuration stops with an error if it is missing.
find_package(FLEX REQUIRED)

set(LEXER_SPEC ${PROJECT_SOURCE_DIR}/1.04_NextToken_with_Flex.l)
set(LEXER_OUT  ${PROJECT_SOURCE_DIR}/1.04_NextToken_with_Flex.h)

# Generate the scanner at build time instead of configure time, so that flex
# is re-run automatically whenever the .l specification changes.
add_custom_command(
    OUTPUT  ${LEXER_OUT}
    COMMAND ${FLEX_EXECUTABLE} -o ${LEXER_OUT} ${LEXER_SPEC}
    DEPENDS ${LEXER_SPEC}
    COMMENT "Generating scanner from 1.04_NextToken_with_Flex.l"
    VERBATIM)

# The generated file is listed as a source so the target depends on it; as a
# .h file it is not compiled on its own (presumably 1.04_main.c includes it).
add_executable(1.04_NextToken_with_Flex 1.04_main.c ${LEXER_OUT})

target_link_libraries(1.04_NextToken_with_Flex PUBLIC
    -lfl)

在外部增加了1.04_main.c文件来调用NextToken函数来返回词法分析器识别到的token。
1.04_main.c文件的内容比较简单,在这里就不列出来。请参考

自测结果:

cmp@t3600:~/work_dir/source_code/yacc_bison_practice/cmake-build-debug/ch1$ ./1.04_NextToken_with_Flex 
+-*/%?!  << >> |&^~ && || [](){}<> , : ; # ' " -> > >= <= < ?,:;#'" 123 .0 3.  3.E1 2E4 2E3 0 -1 -2 -1.1 -1.E-1 9 9_21 if auto break    case    char    const   continue    default do double   else    enum    extern  float for   goto    if int  long    register    return  short   signed  sizeof  static struct   switch  typedef union   unsigned    void volatile   while -> > >= <= < ?,:;#' **/*88888*/*//23r/fg
OPERATOR +
OPERATOR -
OPERATOR *
OPERATOR /
OPERATOR %
QUESTION_SIGN ?
OPERATOR !
OPERATOR <<
OPERATOR >>
OPERATOR |
OPERATOR &
OPERATOR ^
OPERATOR ~
OPERATOR &&
OPERATOR ||
L_SQUARE_BRACKET [
R_SQUARE_BRACKET ]
L_PARENTHESIS (
R_PARENTHESIS )
L_BRACE {
R_BRACE }
LESS_THAN <
GREAT_THAN >
COMMA ,
COLON :
SEMICOLON ;
POUND_SIGN #
QUOTATION '
STRING " -> > >= <= < ?,:;#'"
INTEGER 123
DECIMAL 0.000000
DECIMAL 3.000000
DECIMAL 30.000000
DECIMAL 20000.000000
DECIMAL 2000.000000
INTEGER 0
INTEGER -1
INTEGER -2
DECIMAL -1.100000
DECIMAL -0.100000
INTEGER 9
INTEGER 9
IDENTIFIER _21
KEYWORD if
KEYWORD auto
KEYWORD break
KEYWORD case
KEYWORD char
KEYWORD const
KEYWORD continue
KEYWORD default
KEYWORD do
KEYWORD double
KEYWORD else
KEYWORD enum
KEYWORD extern
KEYWORD float
KEYWORD for
KEYWORD goto
KEYWORD if
KEYWORD int
KEYWORD long
KEYWORD register
KEYWORD return
KEYWORD short
KEYWORD signed
KEYWORD sizeof
KEYWORD static
KEYWORD struct
KEYWORD switch
KEYWORD typedef
KEYWORD union
KEYWORD unsigned
KEYWORD void
KEYWORD volatile
KEYWORD while
OPERATOR ->
GREAT_THAN >
GREAT_EQUAL_THAN >=
LESS_EQUAL_THAN <=
LESS_THAN <
QUESTION_SIGN ?
COMMA ,
COLON :
SEMICOLON ;
POUND_SIGN #
QUOTATION '
OPERATOR **
MULTI_LINE_ANNOTATION /*88888*/
OPERATOR *
ANNOTATION //23r/fg

注意

以上代码中

"/*"([^*]|\*+[^/*])*"*/"    { return MULTI_LINE_ANNOTATION; }   // 识别多行注释

用这条正则来识别C语言中的多行注释并不完善,虽然它能够满足大多数场景。注释有可能很长,而flex对单个记号的输入缓冲有长度限制,通常是16K。为了能够正确识别注释,应该使用起始条件(start condition)方法,关于起始条件的更多信息,请查看Flex手册中的Start Conditions一节。以下提供一种简洁的注释识别代码:

/* Declare COMMENT as an exclusive start condition. */
%x COMMENT
%{
%}
%%
    /* Block comments: the opener switches to COMMENT, the closer switches back
       to INITIAL, and everything in between is discarded piece by piece so
       that no single token has to hold the whole comment.
       NOTE(review): curfilename is not defined in this snippet and must be
       provided by the surrounding program; yylineno presumably needs
       %option yylineno to be maintained - confirm. */
"/*"                    { BEGIN(COMMENT); }
<COMMENT>"*/"           { BEGIN(INITIAL); }
<COMMENT>([^*]|\n)+|.   { /* discard the comment text */ }
<COMMENT><<EOF>>        { printf("%s:%d: Unterminated comment.\n", curfilename, yylineno); return 0; }
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值