因工作需要接触了一下Lex和Yacc,个人感觉挺有趣的,所以就写下来了。
Lex是Lexical的缩写,大概就可以理解为词汇提取。
Yacc是Yet another compiler compiler, 可以翻译为“还有另一个编译器的编译器”,挺拗口的,不过没关系,先不管它。
安装Lex和Yacc
Lex和Yacc是一种标准,当然会有很多的实现了,其中有2个是免费的(好像还有商业版本),那就是flex和bison。如果在ubuntu上安装(也有windows版本),会很简单:
sudo apt-get install flex bison
这样就ok了。
第一个Lex文件
首先我们直接来看一段代码:
%{
int wordCount = 0;
int spaceCount = 0;
int numberCount = 0;
%}
chars [A-Za-z\_\'\.\"]
numbers ([0-9])+
delim [" "\n\t]
whitespace {delim}+
words {chars}+
%%
{words} {wordCount++;}
{whitespace} {spaceCount++;}
{numbers} {numberCount++;}
%%
void main()
{
yylex();
printf(" Number of words: %d\n", wordCount);
printf(" Number of spaces: %d\n", spaceCount);
printf(" Number of nubmers: %d\n", numberCount);
}
int yywrap()
{
return 1;
}
简单说明一下:
1. 第一个%%之前是Lex语法的一些定义,%{ ** %}里面是c语言的一些变量定义。后面的chars,numbers等等是一些正则表达式吧。具体的意思可以查阅Lex说明。
第二段(也就是第一个%%和第二个%%之间),用来计算单词个个数,空格个数和数字个数。
第三段,main函数,这其实是个c函数。
我们创建一个lex文件,叫做test.lex。如:
编译lex
我们已经创建了一个简单的lex文件,那么现在就应该编译它了,直接用一条命令,如:
flex test.lex
发生什么了呢,看截图:
发现多了个lex.yy.c文件,这个文件是由flex test.lex生成的,它是个标准的c语言文件。内容可以看最后的附件,我们先管具体的内容。
编译C文件
我们已经有了lex.yy.c文件,现在编译它。直接用gcc好了。看截图:
编译完,发现多了个a.out,这个是个可执行文件。
使用lex
lex说穿了,基本套路就是:
1. 创建lex文件
2. 使用flex xxx.lex来生成.c文件
3. 使用c编译器(如gcc),来生成一个可执行文件。
那么如何使用呢?刚才的lex的例子就是用来计算一段数据里面有几个英文单词,几个数字,几个空格的。
我们可以这么来测试:
1. 创建一个test.txt测试文件
2. 里面放入:hello world 2016 6 10
3. 使用命令:./a.out < test.txt
如下:
ok,成功了,计算出了单词数2个,数字3个,空格5个。
总结:
简单说来,lex就是用来提取一段给定数据里面的符合lex文件定义的词汇。
附:
test.lex生成的对应c文件:
#line 3 "lex.yy.c"
#define YY_INT_ALIGNED short int
/* A lexical scanner generated by flex */
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 35
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif
/* First, we deal with platform-specific or compiler-specific issues. */
/* begin standard C headers. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
/* end standard C headers. */
/* flex integer type definitions */
#ifndef FLEXINT_H
#define FLEXINT_H
/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
* if you want the limit (max/min) macros for int types.
*/
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif
#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;
/* Limits of integral types. */
#ifndef INT8_MIN
#define INT8_MIN (-128)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX (2147483647)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
#endif /* ! C99 */
#endif /* ! FLEXINT_H */
#ifdef __cplusplus
/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST
#else /* ! __cplusplus */
/* C99 requires __STDC__ to be defined as 1. */
#if defined (__STDC__)
#define YY_USE_CONST
#endif /* defined (__STDC__) */
#endif /* ! __cplusplus */
#ifdef YY_USE_CONST
#define yyconst const
#else
#define yyconst
#endif
/* Returned upon end-of-file. */
#define YY_NULL 0
/* Promotes a possibly negative, possibly signed char to an unsigned
* integer for use as an array index. If the signed char is negative,
* we want to instead treat it as an 8-bit unsigned char, hence the
* double cast.
*/
#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
/* Enter a start condition. This macro really ought to take a parameter,
* but we do it the disgusting crufty way forced on us by the ()-less
* definition of BEGIN.
*/
#define BEGIN (yy_start) = 1 + 2 *
/* Translate the current start state into a value that can be later handed
* to BEGIN to return to the state. The YYSTATE alias is for lex
* compatibility.
*/
#define YY_START (((yy_start) - 1) / 2)
#define YYSTATE YY_START
/* Action number for EOF rule of a given start state. */
#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
/* Special action meaning "start processing a new file". */
#define YY_NEW_FILE yyrestart(yyin )
#define YY_END_OF_BUFFER_CHAR 0
/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k.
* Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
* Ditto for the __ia64__ case accordingly.
*/
#define YY_BUF_SIZE 32768
#else
#define YY_BUF_SIZE 16384
#endif /* __ia64__ */
#endif
/* The state buf must be large enough to hold one state per character in the main buffer.
*/
#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
#ifndef YY_TYPEDEF_YY_BUFFER_STATE
#define YY_TYPEDEF_YY_BUFFER_STATE
typedef struct yy_buffer_state *YY_BUFFER_STATE;
#endif
extern int yyleng;
extern FILE *yyin, *yyout;
#define EOB_ACT_CONTINUE_SCAN 0
#define EOB_ACT_END_OF_FILE 1
#define EOB_ACT_LAST_MATCH 2
#define YY_LESS_LINENO(n)
/* Return all but the first "n" matched characters back to the input stream. */
#define yyless(n) \
do \
{ \
/* Undo effects of setting up yytext. */ \
int yyless_macro_arg = (n); \
YY_LESS_LINENO(yyless_macro_arg);\
*yy_cp = (yy_hold_char); \
YY_RESTORE_YY_MORE_OFFSET \
(yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
YY_DO_BEFORE_ACTION; /* set up yytext again */ \
} \
while ( 0 )
#define unput(c) yyunput( c, (yytext_ptr) )
#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
typedef size_t yy_size_t;
#endif
#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
struct yy_buffer_state
{
FILE *yy_input_file;
char *yy_ch_buf; /* input buffer */
char *yy_buf_pos; /* current position in input buffer */
/* Size of input buffer in bytes, not including room for EOB
* characters.
*/
yy_size_t yy_buf_size;
/* Number of characters read into yy_ch_buf, not including EOB
* characters.
*/
int yy_n_chars;
/* Whether we "own" the buffer - i.e., we know we created it,
* and can realloc() it to grow it, and should free() it to
* delete it.
*/
int yy_is_our_buffer;
/* Whether this is an "interactive" input source; if so, and
* if we're using stdio for input, then we want to use getc()
* instead of fread(), to make sure we stop fetching input after
* each newline.
*/
int yy_is_interactive;
/* Whether we're considered to be at the beginning of a line.
* If so, '^' rules will be active on the next match, otherwise
* not.
*/
int yy_at_bol;
int yy_bs_lineno; /**< The line count. */
int yy_bs_column; /**< The column count. */
/* Whether to try to fill the input buffer when we reach the
* end of it.
*/
int yy_fill_buffer;
int yy_buffer_status;
#define YY_BUFFER_NEW 0
#define YY_BUFFER_NORMAL 1
/* When an EOF's been seen but there's still some text to process
* then we mark the buffer as YY_EOF_PENDING, to indicate that we
* shouldn't try reading from the input source any more. We might
* still have a bunch of tokens to match, though, because of
* possible backing-up.
*
* When we actually see the EOF, we change the status to "new"
* (via yyrestart()), so that the user can continue scanning by
* just pointing y