C语言的正则表达式 regex

最新推荐文章于 2022-05-28 11:38:57 发布

gaoguoxin2

最新推荐文章于 2022-05-28 11:38:57 发布

阅读量1.1k

点赞数

分类专栏：转载篇

转载篇专栏收录该内容

253 篇文章 0 订阅

订阅专栏

https://actom.me/blog/c%E8%AF%AD%E8%A8%80%E7%9A%84%E6%AD%A3%E5%88%99%E8%A1%A8%E8%BE%BE%E5%BC%8F-regex.html

正则表达式在编程中的应用是非常广泛的，在C语言中，同样有着正则表达式的库，我们使用regex.h这个头文件所包含的函数来完成我们的需要：
先看一段例子：

 
         1 
       
         2 
       
         3 
       
         4 
       
         5 
       
         6 
       
         7 
       
         8 
       
         9 
       
         10 
       
         11 
       
         12 
       
         13 
       
         14 
       
         15 
       
         16 
       
         17 
       
         18 
       
         19 
       
         20 
       
         21 
       
         22 
       
         23 
       
         24 
       
         25 
       
         26 
       
         27 
       
         28 
       
         29 
       
         30 
       
         31 
       
         32 
       
         33 
       
         34 
       
         35 
       
         36 
       
         37 
       
        #include <regex.h> 
       
        #include <stdio.h> 
       
        #include <string.h> 
       
        #include <stdlib.h> 
       
        /*
         * Copy the characters str[start] .. str[end - 1] into a freshly
         * allocated, NUL-terminated buffer and return it.
         *
         * The buffer is owned by this function: it is kept in a static pointer
         * and released on the next call, so the caller must not free() it and
         * must copy the result if it has to outlive the next call.  Because of
         * that shared static state the function is not reentrant.
         *
         * The caller must ensure that [start, end) lies inside str; the bytes
         * are copied without looking for a terminator.
         *
         * Returns an empty string when end <= start, and NULL when str is NULL,
         * start is negative, or the allocation fails.
         */
        char *sub_string(char *str, int start, int end)
        {
            static char *st = NULL;
            size_t len;

            /* Release the previous result; free(NULL) is a no-op. */
            free(st);
            st = NULL;

            if (str == NULL || start < 0) {
                return NULL;
            }

            /* An inverted or empty range yields "" instead of a bogus size. */
            len = (end > start) ? (size_t)(end - start) : 0;

            st = malloc(len + 1);
            if (st == NULL) {
                return NULL;
            }

            memcpy(st, str + start, len);
            st[len] = '\0';
            return st;
        }
       
        /*
         * Demo: print the target of every href="..." attribute found in a
         * hard-coded HTML fragment.
         *
         * The pattern has three parenthesised groups, so each match fills four
         * slots: pm[0] is the whole match, pm[1] the `href="` prefix, pm[2] the
         * URL and pm[3] the closing quote.  regexec() only reports the first
         * match in the string it is given, so the loop restarts the search just
         * past the previous match.
         *
         * Returns 0 on success and -1 if the pattern cannot be compiled or a
         * substring cannot be extracted.
         */
        int main(void)
        {
            regmatch_t pm[4];
            regex_t preg;
            int rc = 0;
            const char *pattern = "(href\\ *=\\ *\\\")([^\\\"]*)(\\\")"; /* regular expression */
            /* text to search; kept as char * because sub_string() takes char * */
            char *file = "<a href=\"http://www.awaysoft.com\">Awaysoft.com</a><a href=\"http://www.awaysoft2.com\">Awaysoft2.com</a><a href=\"http://www.awaysoft3.com\">Awaysoft.com3</a>";
            char *st;

            /* Compile the regular expression. */
            if (regcomp(&preg, pattern, REG_EXTENDED | REG_NEWLINE) != 0) {
                fprintf(stderr, "Cannot regex compile!");
                return -1;
            }

            st = file;
            /*
             * regexec() returns 0 on a match.  Any other value (REG_NOMATCH or
             * an error code) leaves pm[] unusable, so it ends the scan.
             */
            while (regexec(&preg, st, sizeof pm / sizeof pm[0], pm, REG_NOTEOL) == 0) {
                char *url = sub_string(st, pm[2].rm_so, pm[2].rm_eo);

                if (url == NULL) {
                    fprintf(stderr, "Cannot extract substring!\n");
                    rc = -1;
                    break;
                }
                printf("%s\n", url);

                /* Continue right after the closing quote of this match. */
                st += pm[3].rm_eo;
            }

            /* Release the storage held by the compiled expression. */
            regfree(&preg);
            return rc;
        }

这段程序的运行结果是：

 
         1 
       
         2 
       
         3 
       
        http://www.awaysoft.com
        http://www.awaysoft2.com
        http://www.awaysoft3.com

要注意的是regexec仅仅能进行一次匹配，要多次匹配需要用循环！
所以，我用了一个while语句。pm这个数组的元素个数对应的是分组数目，不是匹配个数。看到正则表达式中，有3组()，那么就代表有3+1个分组，+1是指被匹配的全部字符串。用上面给的例子，那么pm的长度就是4，其中pm[0]对应 "href=\"http://www.awaysoft.com\""，pm[1]对应 "href=\""，pm[2]对应 "http://www.awaysoft.com"，pm[3]对应 "\""。注意pm的每个元素保存的并不是字符串本身，而是该子串在被匹配串中的起止偏移rm_so和rm_eo。
st = &st[pm[3].rm_eo];这句是将指针移动到下一次匹配开始的字符。

附：
正则表达式快速入门

下面，转了一篇regex的详细用法：
编译正则表达式

为了提高效率，在将一个字符串和正则表达式进行比较之前，首先要用regcomp()函数对他进行编译，将其转化为regex_t结构：

int regcomp(regex_t *preg, const char *regex, int cflags);

参数regex是个字符串，他代表将要被编译的正则表达式；参数preg指向一个声明为regex_t的数据结构，用来保存编译结果；参数cflags决定了正则表达式该怎么被处理的细节。

如果函数regcomp()执行成功，并且编译结果被正确填充到preg中后，函数将返回0，所有其他的返回结果都代表有某种错误产生。

匹配正则表达式

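一旦用regcomp()函数成功地编译了正则表达式，接下来就能调用regexec()函数完成模式匹配：

int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);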

参数preg指向编译后的正则表达式，参数string是将要进行匹配的字符串，而参数nmatch和pmatch则用于把匹配结果返回给调用程式，最后一个参数eflags决定了匹配的细节。

在调用函数regexec()进行模式匹配的过程中，每次调用只会找到字符串string中第一处和给定正则表达式相匹配的位置，参数pmatch就是用来保存这次匹配的位置信息的，而参数nmatch则告诉函数regexec()最多能把多少个结果填充到pmatch数组中。当regexec()函数成功返回时，从string+pmatch[0].rm_so到string+pmatch[0].rm_eo是整个匹配到的字符串，而从string+pmatch[1].rm_so到string+pmatch[1].rm_eo，则是第一个子表达式（第一对括号）匹配到的字符串，依此类推。

释放正则表达式

无论什么时候，当不再需要已编译过的正则表达式时，都应该调用函数regfree()将其释放，以免产生内存泄漏。

void regfree(regex_t *preg);

函数regfree()不会返回任何结果，他仅接收一个指向regex_t数据类型的指针，这是之前调用regcomp()函数所得到的编译结果。

如果在程式中针对同一个regex_t结构调用了多次regcomp()函数，POSIX标准并没有规定是否每次都必须调用regfree()函数进行释放，但建议每次调用regcomp()函数对正则表达式进行编译后都调用一次regfree()函数，以尽早释放占用的存储空间。

报告错误信息

如果调用函数regcomp()或regexec()得到的是个非0的返回值，则表明在对正则表达式的处理过程中出现了某种错误，此时能通过调用函数regerror()得到详细的错误信息。

size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);

参数errcode是来自函数regcomp()或regexec()的错误代码，而参数preg则是由函数regcomp()得到的编译结果，其目的是把格式化消息所必须的上下文提供给regerror()函数。在执行函数regerror()时，将按照参数errbuf_size指明的最大字节数，在errbuf缓冲区中填入格式化后的错误信息，同时返回错误信息的长度。

应用正则表达式

最后给出一个具体的实例，介绍怎么在C语言程式中处理正则表达式。

 
         1 
       
         2 
       
         3 
       
         4 
       
        #include <regex.h>
        #include <stdio.h>
        #include <string.h>
        #include <stdlib.h>

 
C
 
         1 
       
         2 
       
         3 
       
         4 
       
         5 
       
         6 
       
         7 
       
         8 
       
         9 
       
         10 
       
         11 
       
         12 
       
         13 
       
         14 
       
         15 
       
         16 
       
         17 
       
         18 
       
         19 
       
         20 
       
         21 
       
         22 
       
         23 
       
         24 
       
         25 
       
         26 
       
         27 
       
         28 
       
         29 
       
         30 
       
         31 
       
         32 
       
         33 
       
         34 
       
         35 
       
         36 
       
         37 
       
         38 
       
         39 
       
         40 
       
         41 
       
         42 
       
         43 
       
         44 
       
         45 
       
         46 
       
         47 
       
         48 
       
         49 
       
        /*
         * Return the substring str[start, end) as a NUL-terminated string.
         *
         * The result lives in a static 256-byte buffer that is overwritten by
         * the next call, so copy it if it must survive.  Substrings longer than
         * 255 bytes are truncated to fit, and an empty string is returned when
         * end <= start.  The caller must ensure that start does not exceed the
         * length of str.
         */
        static char *substr(const char *str, unsigned start, unsigned end)
        {
            static char stbuf[256];
            /* Guard against unsigned wrap-around when the range is inverted. */
            size_t n = (end > start) ? (size_t)(end - start) : 0;

            /* Clamp instead of writing past the end of stbuf. */
            if (n > sizeof stbuf - 1) {
                n = sizeof stbuf - 1;
            }

            /* strncpy stops at a NUL in str; the terminator is set explicitly. */
            strncpy(stbuf, str + start, n);
            stbuf[n] = '\0';
            return stbuf;
        }
       
        /*
         * Main program: compile the regular expression given as argv[1], apply
         * it to every line read from standard input, and for each matching line
         * print the line number, the line itself and every reported match slot
         * ($0 is the whole match, $1.. are the parenthesised subexpressions).
         *
         * Input is read in chunks of at most sizeof(lbuf) - 1 bytes, so a longer
         * line is processed as several pieces.
         *
         * Exit status: 0 on success, 1 if the pattern is missing or cannot be
         * compiled, 2 if regexec() fails with an error other than "no match".
         */
        int main(int argc, char **argv)
        {
            const char *pattern;
            int z, lno = 0, cflags = 0;
            size_t x, len;
            char ebuf[128], lbuf[256];
            regex_t reg;
            regmatch_t pm[10];
            const size_t nmatch = sizeof pm / sizeof pm[0];

            /* argv[1] is only valid when a pattern was actually supplied. */
            if (argc < 2) {
                fprintf(stderr, "usage: %s pattern\n", argc > 0 ? argv[0] : "regexp");
                return 1;
            }

            /* Compile the regular expression. */
            pattern = argv[1];
            z = regcomp(&reg, pattern, cflags);
            if (z != 0) {
                regerror(z, &reg, ebuf, sizeof(ebuf));
                fprintf(stderr, "%s: pattern '%s' \n", ebuf, pattern);
                return 1;
            }

            /* Process the input line by line. */
            while (fgets(lbuf, sizeof(lbuf), stdin)) {
                ++lno;

                /* Strip the trailing newline that fgets() keeps. */
                len = strlen(lbuf);
                if (len > 0 && lbuf[len - 1] == '\n') {
                    lbuf[len - 1] = 0;
                }

                /* Apply the regular expression to this line. */
                z = regexec(&reg, lbuf, nmatch, pm, 0);
                if (z == REG_NOMATCH) {
                    continue;
                }
                if (z != 0) {
                    regerror(z, &reg, ebuf, sizeof(ebuf));
                    fprintf(stderr, "%s: regexec('%s')\n", ebuf, lbuf);
                    regfree(&reg);
                    return 2;
                }

                /* Print the results; the loop stops at the first slot whose rm_so is -1. */
                for (x = 0; x < nmatch && pm[x].rm_so != -1; ++x) {
                    if (x == 0) {
                        printf("%04d: %s\n", lno, lbuf);
                    }
                    printf(" $%d='%s'\n", (int)x,
                           substr(lbuf, (unsigned)pm[x].rm_so, (unsigned)pm[x].rm_eo));
                }
            }

            /* Release the compiled regular expression. */
            regfree(&reg);
            return 0;
        }

上述程式负责从命令行获取正则表达式，然后将其运用于从标准输入得到的每行数据，并打印出匹配结果。执行下面的命令能编译并执行该程式：

 
         1 
       
         2 
       
         3 
       
         4 
       
         5 
       
         6 
       
         7 
       
         8 
       
        # gcc regexp.c -o regexp
        # ./regexp 'regex[a-z]*' < regexp.c
        0003: #include <regex.h>
         $0='regex'
        0027: regex_t reg;
         $0='regex'
        0054: z = regexec(&reg, lbuf, nmatch, pm, 0);
         $0='regexec'

小结

对那些需要进行复杂数据处理的程式来说，正则表达式无疑是个非常有用的工具。本文重点在于阐述怎么在C语言中利用正则表达式来简化字符串处理，以便在数据处理方面能够获得和Perl语言类似的灵活性

my test code:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>

/*
 * Return the substring str[start, end) as a NUL-terminated string.
 *
 * The result lives in a static 256-byte buffer that is overwritten by the
 * next call, so copy it if it must survive.  Substrings longer than 255
 * bytes are truncated to fit, and an empty string is returned when
 * end <= start.  The caller must ensure that start does not exceed the
 * length of str.
 */
static char *substr(const char *str, unsigned start, unsigned end)
{
	static char stbuf[256];
	/* Guard against unsigned wrap-around when the range is inverted. */
	size_t n = (end > start) ? (size_t)(end - start) : 0;

	/* Clamp instead of writing past the end of stbuf. */
	if (n > sizeof stbuf - 1) {
		n = sizeof stbuf - 1;
	}

	/* strncpy stops at a NUL in str; the terminator is set explicitly. */
	strncpy(stbuf, str + start, n);
	stbuf[n] = '\0';
	return stbuf;
}


/*
 * Demo: search a fixed sample string for "EINFO " followed by seven
 * colon-terminated groups of four upper-case hex digits, then print the
 * matched text, the input starting at the match, and the input starting
 * just after the match.
 *
 * Returns 0 normally and -1 if the pattern cannot be compiled.
 */
int main(void)
{
	regmatch_t pm[1];	/* only slot 0, the whole match, is requested */
	regex_t preg;
	const char *pattern = "EINFO ([0-9A-F]{4}:){7}";	/* regular expression */
	const char *file = "aaaEINFO FE80:0000:0000:0000:021D:1290:1234:5678 001D129012345678 21 88 FFEE";	/* text to search */

	/* Compile the regular expression. */
	if (regcomp(&preg, pattern, REG_EXTENDED | REG_NEWLINE) != 0) {
		fprintf(stderr, "Cannot regex compile!");
		return -1;
	}

	/*
	 * regexec() returns 0 on a match.  Any other value (REG_NOMATCH or an
	 * error code) means pm[] was not filled in and must not be read.
	 */
	if (regexec(&preg, file, sizeof pm / sizeof pm[0], pm, REG_NOTEOL) == 0) {
		printf("%s\n", substr(file, (unsigned)pm[0].rm_so, (unsigned)pm[0].rm_eo));
		printf("st:%s\n", file + pm[0].rm_so);
		printf("ed:%s\n", file + pm[0].rm_eo);
	}

	/* Release the compiled regular expression. */
	regfree(&preg);
	return 0;
}