3_06_GLib库入门与实践_正则表达式

简介

GLib正则表达式Regex依赖PCRE库,在编译时,如果configure参数指定的是internal,则使用GLib内部自带的PCRE库,如果没有指定internal参数,则会使用系统的PCRE库。

数据结构

enum  GRegexCompileFlags  // 编译标识,
enum  GRegexMatchFlags  // 匹配标识
struct  GRegex  //正则表达式的编译形式,这是一个不透明结构体
struct  GMatchInfo //不透明结构体,内含匹配信息

函数列表

GRegex * 	g_regex_new ()
GRegex * 	g_regex_ref ()
void 	g_regex_unref ()
const gchar * 	g_regex_get_pattern ()
gint 	g_regex_get_max_backref ()
gint 	g_regex_get_capture_count ()
gboolean 	g_regex_get_has_cr_or_lf ()
gint 	g_regex_get_max_lookbehind ()
gint 	g_regex_get_string_number ()
GRegexCompileFlags 	g_regex_get_compile_flags ()
GRegexMatchFlags 	g_regex_get_match_flags ()
gchar * 	g_regex_escape_string ()
gchar * 	g_regex_escape_nul ()
gboolean 	g_regex_match_simple ()
gboolean 	g_regex_match ()
gboolean 	g_regex_match_full ()
gboolean 	g_regex_match_all ()
gboolean 	g_regex_match_all_full ()
gchar ** 	g_regex_split_simple ()
gchar ** 	g_regex_split ()
gchar ** 	g_regex_split_full ()
gchar * 	g_regex_replace ()
gchar * 	g_regex_replace_literal ()
gchar * 	g_regex_replace_eval ()
gboolean 	g_regex_check_replacement ()
GRegex * 	g_match_info_get_regex ()
const gchar * 	g_match_info_get_string ()
GMatchInfo * 	g_match_info_ref ()
void 	g_match_info_unref ()
void 	g_match_info_free ()
gboolean 	g_match_info_matches ()
gboolean 	g_match_info_next ()
gint 	g_match_info_get_match_count ()
gboolean 	g_match_info_is_partial_match ()
gchar * 	g_match_info_expand_references ()
gchar * 	g_match_info_fetch ()
gboolean 	g_match_info_fetch_pos ()
gchar * 	g_match_info_fetch_named ()
gboolean 	g_match_info_fetch_named_pos ()
gchar ** 	g_match_info_fetch_all ()

函数功能分类

// 创建
GRegex * 	g_regex_new ()

// 匹配函数
gboolean 	g_regex_match_simple ()
gboolean 	g_regex_match ()
gboolean 	g_regex_match_full ()
gboolean 	g_regex_match_all ()
gboolean 	g_regex_match_all_full ()

// 匹配信息相关函数
GRegex * 	g_match_info_get_regex ()
const gchar * 	g_match_info_get_string ()
GMatchInfo * 	g_match_info_ref ()
void 	g_match_info_unref ()
void 	g_match_info_free ()
gboolean 	g_match_info_matches ()
gboolean 	g_match_info_next ()
gint 	g_match_info_get_match_count ()
gboolean 	g_match_info_is_partial_match ()
gchar * 	g_match_info_expand_references ()
gchar * 	g_match_info_fetch ()
gboolean 	g_match_info_fetch_pos ()
gchar * 	g_match_info_fetch_named ()
gboolean 	g_match_info_fetch_named_pos ()
gchar ** 	g_match_info_fetch_all ()

// 字符串分隔
gchar ** 	g_regex_split_simple ()
gchar ** 	g_regex_split ()
gchar ** 	g_regex_split_full ()

// 字符串替换
gchar * 	g_regex_replace ()
gchar * 	g_regex_replace_literal ()
gchar * 	g_regex_replace_eval ()
gboolean 	g_regex_check_replacement ()

// 转义
gchar * 	g_regex_escape_string ()
gchar * 	g_regex_escape_nul ()

// 引用及解引用
GRegex * 	g_regex_ref ()
void 	g_regex_unref ()

// 其他
const gchar * 	g_regex_get_pattern ()
gint 	g_regex_get_max_backref ()
gint 	g_regex_get_capture_count ()
gboolean 	g_regex_get_has_cr_or_lf ()
gint 	g_regex_get_max_lookbehind ()
gint 	g_regex_get_string_number ()
GRegexCompileFlags 	g_regex_get_compile_flags ()
GRegexMatchFlags 	g_regex_get_match_flags ()

函数功能说明及综合演示

基本用法示例

本示例演示GLib提供的GRegex基本用法——数字匹配。
给定的待匹配字符串是"11aa222bb33333cccc44444dddddddd",模式串为"[0-9]+"
源码见glib_examples\glib_regex\glib_regex_basic

#include <glib.h>

int main (int argc, char** argv)
{
    GRegex* regex;
    GMatchInfo *match_info;
    GError *error = NULL;
    const gchar *str = "11aa222bb33333cccc44444dddddddd";
    const gchar *pat = "[0-9]+";

    regex = g_regex_new(pat, 0, 0, &error);
    g_regex_match(regex, str, 0, &match_info);

    while (g_match_info_matches(match_info)) {
        gchar* word = g_match_info_fetch(match_info, 0);
        g_print("%s \n",word);
        g_free(word);

        g_match_info_next(match_info, NULL);
    }

    g_match_info_free(match_info);
    g_regex_unref(regex);

  return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_basic
11
222
33333
44444
单词匹配

如果目标字符串是"the they them the",模式串是"the",则四个单词中的"the"都会被匹配到(包括they、them开头的the)。如果只想精确匹配单词the,可以使用元字符\b,它匹配的是单词边界(只匹配位置,不占用字符),对应的表达式为"\\bthe\\b"(在C字符串字面量中反斜杠需要写成两个)。
源码见glib_examples\glib_regex\glib_regex_match_word

#include <glib.h>

int main (int argc, char** argv)
{
    GRegex* regex;
    GMatchInfo *match_info;
    GError *error = NULL;
    const gchar *str = "the name of the frog is themselves";
    const gchar *pat = "\\bthe\\b";

    regex = g_regex_new(pat, 0, 0, &error);
    g_regex_match(regex, str, 0, &match_info);

    while (g_match_info_matches(match_info)) {
        gchar* word = g_match_info_fetch(match_info, 0);
        g_print("%s \n",word);
        g_free(word);

        g_match_info_next(match_info, NULL);
    }

    g_match_info_free(match_info);
    g_regex_unref(regex);

  return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_match_word
the
the

可以看到,最后的themselves单词没有被匹配到。

后向引用(回溯引用)

有下面一些html字符串,

<h1>xx</H1>
<h2>xx</H2>
<h2>xx</h3>

如果我们用表达式<[hH][1-6]>.*?</[hH][1-6]>匹配,则像上面第三行那种以<h2>开头、却以</h3>结尾的错误标签也会被匹配到,但我们想要的只是首尾配对正确的标签。这里可以使用后向引用表达式<[hH]([1-6])>.*?</[hH]\1>(在C字符串字面量中要写成"\\1"):将[1-6]用一个括号括起来,然后使用\1来引用它,如果[1-6]匹配的是数字2,则后面的这个\1也只会匹配到数字2。后向引用的前提概念是表达式分组,括号内的([1-6])即是一个分组,如果有多个括号,则说明该表达式有多个分组。如果整个表达式有N个分组,我们可以使用\1、\2、\3、...、\N来引用各个分组。这就是后向引用。
GLib后向引用相关的函数如下:

// 返回模式中编号最大的后向引用的编号;如果模式不含后向引用,则返回0
g_regex_get_max_backref

现举例说明。
源码见glib_examples\glib_regex\glib_regex_back_ref

#include <string.h>
#include <glib.h>

int main (int argc, char** argv)
{
    GRegex* regex;
    GMatchInfo *match_info;
    GError *error = NULL;
    const gchar *str = "<body><h1>Title</H1>hello:<br><h2>SubTitle1</h2>world.<H2>SubTitle2</H2>foobar.<h2>This is invalid HTML</h3></body>";
    //const gchar *pat = "<[hH][1-6]>.*?</[hH][1-6]>";
    const gchar *pat = "<[hH]([1-6])>.*?</[hH]\\1>";

    regex = g_regex_new(pat, 0, 0, &error);

    const gchar *tmp = NULL;
    tmp = g_regex_get_pattern(regex);
    if(NULL != tmp) {
        g_print("get pattern: %s \n", tmp);
    }

    gint cnt = 0;
    cnt = g_regex_get_max_backref(regex);
    g_print("back reference num is: %d \n", cnt);

    g_print("original str:\n%s\n", str);
    
    g_regex_match(regex, str, 0, &match_info);

    while (g_match_info_matches(match_info)) {
        gchar* word = g_match_info_fetch(match_info, 0);
        g_print("%s \n",word);
        g_free(word);

        g_match_info_next(match_info, NULL);
    }

    g_match_info_free(match_info);
    g_regex_unref(regex);

  return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_back_ref
get pattern: <[hH]([1-6])>.*?</[hH]\1>
back reference num is: 1
original str:
<body><h1>Title</H1>hello:<br><h2>SubTitle1</h2>world.<H2>SubTitle2</H2>foobar.<h2>This is invalid HTML</h3></body>
<h1>Title</H1>
<h2>SubTitle1</h2>
<H2>SubTitle2</H2>
match函数族–g_regex_match_simple

与g_regex_new() + g_regex_match()功能相同,但更为方便。如果模式相同,多次调用g_regex_match_simple不如调用g_regex_new一次+g_regex_match多次高效。

// 字符串是否与模式匹配
g_regex_match_simple

g_regex_match_simple函数使用非常简单,只需要传入模式串与待匹配字符串。
源码见glib_examples\glib_regex\glib_regex_match_simple

#include <glib.h>

int main (int argc, char** argv)
{
    g_print("a* %s abc \n", g_regex_match_simple("a*", "abc", 0, 0)?"match":"not match");
    g_print("[a-z]+ %s abc \n", g_regex_match_simple("[a-z]+", "abc", 0, 0)?"match":"not match");
    g_print("[a-z]+ %s ABC \n", g_regex_match_simple("[a-z]+", "ABC", 0, 0)?"match":"not match");

    return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_match_simple
a* match abc
[a-z]+ match abc
[a-z]+ not match ABC
match函数族–g_regex_match

g_regex_match是最常见的字符串模式匹配函数,下面演示其使用过程。
源码见glib_examples\glib_regex\glib_regex_match

#include <glib.h>

/*
 * Print every run of uppercase ASCII letters found in `string`.
 *
 * Note that this reports runs of capitals, not whole uppercase words:
 * for "Hello" it prints "H". One "Found: ..." line is written per run;
 * nothing is printed when the string contains no capital letter.
 */
static void test_print_uppercase_words (const gchar *string)
{
    GRegex *upper_re = g_regex_new ("[A-Z]+", 0, 0, NULL);
    GMatchInfo *info = NULL;

    /* Start the scan, test for a hit, then advance to the next hit. */
    for (g_regex_match (upper_re, string, 0, &info);
         g_match_info_matches (info);
         g_match_info_next (info, NULL))
    {
        /* Group 0 is the entire match; the copy belongs to us. */
        gchar *hit = g_match_info_fetch (info, 0);

        g_print ("Found: %s in %s \n", hit, string);
        g_free (hit);
    }

    g_match_info_free (info);
    g_regex_unref (upper_re);
}

int main (int argc, char** argv)
{
    test_print_uppercase_words("Hello, World!");
    test_print_uppercase_words("hello, world!");
    test_print_uppercase_words("HELLO, world!");
    test_print_uppercase_words("HELLO, WORLD!");

  return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_match
Found: H in Hello, World!
Found: W in Hello, World!
Found: HELLO in HELLO, world!
Found: HELLO in HELLO, WORLD!
Found: WORLD in HELLO, WORLD!
match函数族–g_regex_match_full

g_regex_match_full也是常用的字符串模式匹配函数,和g_regex_match用法几乎一样,但多了string_len和start_position两个参数。如果string_len为-1,start_position为0,则和g_regex_match效果完全一样。
g_regex_match_full的原型为:

/*
 * Prototype of g_regex_match_full(), shown for reference.
 *
 * Works like g_regex_match() but additionally takes the subject length and
 * the offset at which scanning starts.
 *
 *   regex          - compiled pattern to match with.
 *   string         - subject text to scan.
 *   string_len     - number of bytes of `string` to consider; -1 means the
 *                    whole NUL-terminated string (7 limits the sample
 *                    "abcd1234efgh567xyz" to "abcd123").
 *   start_position - offset in `string` where matching begins (5 starts the
 *                    sample at "234efgh567xyz").
 *   match_options  - match flags; 0 for the defaults.
 *   match_info     - out: receives the match state that is then walked with
 *                    the g_match_info_*() functions and released with
 *                    g_match_info_free().
 *   error          - out: location for a GError.
 *
 * With string_len == -1 and start_position == 0 the result is the same as
 * calling g_regex_match().
 */
gboolean
g_regex_match_full (const GRegex *regex,
                    const gchar *string,
                    gssize string_len,
                    gint start_position,
                    GRegexMatchFlags match_options,
                    GMatchInfo **match_info,
                    GError **error);

其传参后运行效果如下:

string pattern string_len start_position expect1 expect2
“abcd1234efgh567xyz” “[0-9]+” -1 0 得到 “1234” “567”
“abcd1234efgh567xyz” “[0-9]+” 7 0 得到 “123”
“abcd1234efgh567xyz” “[0-9]+” -1 5 得到 “234” “567”

下面是示例函数。
源码见glib_examples\glib_regex\glib_regex_match_full

#include <glib.h>

int main (int argc, char** argv)
{
    GRegex* regex;
    GMatchInfo *match_info;
    GError *error = NULL;
    const gchar *str = "abcd1234efgh567xyz";
    const gchar *pat = "[0-9]+";

    regex = g_regex_new(pat, 0, 0, &error);

    g_regex_match_full(regex, str, -1, 5, 0, &match_info, &error);

    while (g_match_info_matches(match_info)) {
        gchar* word = g_match_info_fetch(match_info, 0);
        g_print("%s in %s \n",word, str);
        g_free(word);

        g_match_info_next(match_info, NULL);
    }

    g_match_info_free(match_info);
    g_regex_unref(regex);

  return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_match_full
234 in abcd1234efgh567xyz
567 in abcd1234efgh567xyz
分隔
//g_regex_split_simple相当于g_regex_new+g_regex_split,当模式固定但需要多次调用时,不如后者高效。
g_regex_split_simple 
g_regex_split
g_regex_split_full

示例函数:
源码见glib_examples\glib_regex\glib_regex_string_split

#include <glib.h>

/*
 * Split a comma-separated sample string with g_regex_split_simple() and
 * print each resulting token on its own line.
 *
 * The separator pattern ",\s*" consumes a comma plus any whitespace after
 * it, so two adjacent commas produce an empty token. Tokens are labelled
 * starting from 1.
 */
static void g_regex_split_simple_test(void)
{
    int i;
    gchar **resv = NULL;
    const gchar *pat = ",\\s*";
    const gchar *str = "abc, ad,,   d1, bee   , a,dfd, pp";

    resv = g_regex_split_simple(pat, str, 0, 0);

    g_print("pattern: %s\n", pat);
    g_print("ori: %s\n", str);

    /*
     * The index is read and advanced in separate expressions: using `i` and
     * `resv[i++]` as arguments of one call is unsequenced and therefore
     * undefined behaviour. The NULL check covers a failed split.
     */
    for (i = 0; resv != NULL && resv[i] != NULL; i++) {
        g_print("[%d]%s \n", i + 1, resv[i]);
    }

    /* Frees the vector and every token; safe when resv is NULL. */
    g_strfreev(resv);
}

/* Entry point: runs the split demo once; command-line arguments are unused. */
int main (int argc, char** argv)
{
    (void)argc;
    (void)argv;

    g_regex_split_simple_test();
    return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_string_split
pattern: ,\s*
ori: abc, ad,,   d1, bee   , a,dfd, pp
[1]abc
[2]ad
[3]
[4]d1
[5]bee
[6]a
[7]dfd
[8]pp
替换和批量替换
// 将匹配到的字符串替换为新的字符串
g_regex_replace
// 批量替换
g_regex_replace_eval

字符串替换示例:
源码见glib_examples\glib_regex\glib_regex_replace

#include <glib.h>

static void g_regex_replace_test(void)
{
    GRegex *reg;
    gchar *res;

    gchar *ori = "abcd1234ghi56xyz";
    gchar *rep = "NUM";

    reg = g_regex_new("[0-9]+", 0, 0, NULL);
    res = g_regex_replace(reg, ori, -1, 0, rep, 0, NULL);

    g_print("ori:%s \n", ori);
    g_print("res:%s \n", res);

    g_free(res);
    g_regex_unref(reg);

    return;
}

/* Entry point: runs the replace demo once; command-line arguments are unused. */
int main (int argc, char** argv)
{
    (void)argc;
    (void)argv;

    g_regex_replace_test();
    return 0;
}

运行结果:
[root@centos7_6 build]# ./glib_regex_replace
ori:abcd1234ghi56xyz
res:abcdNUMghiNUMxyz

字符串批量替换示例:
(注意:g_regex_replace_eval的返回值需要调用者显式释放。)
源码见glib_examples\glib_regex\glib_regex_replace_multiple

#include <glib.h>

/*
 * Replacement callback for g_regex_replace_eval().
 *
 * Looks the matched text up in the hash table passed as `data` and appends
 * the mapped string to `res`. If the table has no entry for the match, the
 * matched text itself is appended so no input is lost.
 *
 *   info - state of the current match.
 *   res  - output buffer the replacement text is appended to.
 *   data - GHashTable mapping matched text to its replacement string.
 *
 * Returns FALSE so that the replacement process continues with the next
 * match.
 */
static gboolean eval_cb (const GMatchInfo *info, GString *res, gpointer data)
{
    gchar *match;
    const gchar *r;

    match = g_match_info_fetch (info, 0);
    if (match == NULL) {
        /* Nothing to look up; leave the output untouched and carry on. */
        return FALSE;
    }

    r = g_hash_table_lookup ((GHashTable *)data, match);
    /* A missing key yields NULL, which g_string_append() must not receive. */
    g_string_append (res, r != NULL ? r : match);
    g_free (match);

    return FALSE;
}

static void g_regex_replace_eval_test(void)
{
    GRegex *reg;
    GHashTable *h;
    gchar *res;

    gchar *ori = "a1b2c3d74A151B12C633";

    h = g_hash_table_new (g_str_hash, g_str_equal);

    g_hash_table_insert (h, "1", "ONE");
    g_hash_table_insert (h, "2", "TWO");
    g_hash_table_insert (h, "3", "THREE");
    g_hash_table_insert (h, "4", "FOUR");

    reg = g_regex_new ("1|2|3|4", 0, 0, NULL);
    res = g_regex_replace_eval (reg, ori, -1, 0, 0, eval_cb, h, NULL);

    g_print("ori:%s \n", ori);
    g_print("res:%s \n", res);

    g_free(res);

    g_hash_table_destroy (h);
    g_regex_unref(reg);

    return;
}

/* Entry point: runs the callback-based replace demo once; arguments are unused. */
int main (int argc, char** argv)
{
    (void)argc;
    (void)argv;

    g_regex_replace_eval_test();
    return 0;
}

运行结果:

[root@centos7_6 build]# ./glib_regex_replace_multiple
ori:a1b2c3d74A151B12C633
res:aONEbTWOcTHREEd7FOURAONE5ONEBONETWOC6THREETHREE
g_match_info_xxx()相关函数
GRegex * 	g_match_info_get_regex ()
const gchar * 	g_match_info_get_string ()
GMatchInfo * 	g_match_info_ref ()
void 	g_match_info_unref ()
void 	g_match_info_free ()
gboolean 	g_match_info_matches ()
gboolean 	g_match_info_next ()
gint 	g_match_info_get_match_count ()
gboolean 	g_match_info_is_partial_match ()
gchar * 	g_match_info_expand_references ()
gchar * 	g_match_info_fetch ()
gboolean 	g_match_info_fetch_pos ()
gchar * 	g_match_info_fetch_named ()
gboolean 	g_match_info_fetch_named_pos ()
gchar ** 	g_match_info_fetch_all ()

g_match_info_xxx()相关的函数如上所示,常用的有以下几个:

// 判断上一次匹配操作是否匹配成功(匹配到的内容保存在match_info中)。
g_match_info_matches
// 取出模式匹配到的字符串
g_match_info_fetch
// 释放匹配到的模式信息
g_match_info_free
获取GRegex对应的模式串
// 得到GRegex对应的pattern字符串
g_regex_get_pattern

这个函数在模式串含有转义字符时特别有用:C源码中的字符串字面量(如"\\bthe\\b")要先经过编译器的转义处理才传给g_regex_new,通过g_regex_get_pattern可以得到GRegex实际使用的模式串(如\bthe\b),便于核对转义是否正确。

检测表达式是否包含回车换行

有以下特殊字符:

  • CR:Carriage Return,对应ASCII中转义字符\r,表示回车
  • LF:Linefeed,对应ASCII中转义字符\n,表示换行
  • CRLF:Carriage Return & Linefeed,\r\n,表示回车并换行

g_regex_get_has_cr_or_lf函数用来检测表达式中是否显式包含回车(CR)或换行(LF)字符。
下面是本函数的使用示例片段:

/*
 * Check that g_regex_get_has_cr_or_lf() reports TRUE for a pattern that
 * explicitly contains carriage-return and line-feed characters.
 */
static void test_explicit_crlf (void)
{
  GRegex *regex;

  /* The C escapes put literal CR and LF characters into the character class. */
  regex = g_regex_new ("[\r\n]a", 0, 0, NULL);
  g_assert_cmpint (g_regex_get_has_cr_or_lf (regex), ==, TRUE);
  g_regex_unref (regex);
}
前向断言和后向断言
// 获取模式中最长的后行断言(lookbehind)所包含的字符个数
g_regex_get_max_lookbehind

要理解先行断言和后行断言,需要先理解正则表达式位置的概念。
所谓位置,是指字符串中(每行)第一个字符的左边、最后一个字符的右边以及相邻字符的中间。
举例来说,在字符串"regular express"中,regular的右边和express的左边各有一个位置(两个单词并不相连),这个很好理解;第一个字符r和第二个字符e之间同样有一个位置,这个位置的左边是r,右边是e。与^代表开头、$代表结尾、\b代表单词边界一样,先行断言和后行断言也只匹配某些位置,在匹配过程中不占用字符,因此也叫零宽断言。
正则表达式的先行断言和后行断言一共有4种形式:

  • (?=pattern) 零宽正向先行断言(zero-width positive lookahead assertion)
  • (?!pattern) 零宽负向先行断言(zero-width negative lookahead assertion)
  • (?<=pattern) 零宽正向后行断言(zero-width positive lookbehind assertion)
  • (?<!pattern) 零宽负向后行断言(zero-width negative lookbehind assertion)
DFA算法函数

PCRE库是一个NFA正则表达式引擎。g_regex_match_all和g_regex_match_all_full模式匹配时用的是DFA算法,这里不做讨论。

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值