简介
GLib正则表达式Regex依赖PCRE库,在编译时,如果configure参数指定的是internal,则使用GLib内部自带的PCRE库,如果没有指定internal参数,则会使用系统的PCRE库。
数据结构
enum GRegexCompileFlags // 编译标识
enum GRegexMatchFlags // 匹配标识
struct GRegex //正则表达式的编译形式,这是一个不透明结构体
struct GMatchInfo //不透明结构体,内含匹配信息
函数列表
GRegex * g_regex_new ()
GRegex * g_regex_ref ()
void g_regex_unref ()
const gchar * g_regex_get_pattern ()
gint g_regex_get_max_backref ()
gint g_regex_get_capture_count ()
gboolean g_regex_get_has_cr_or_lf ()
gint g_regex_get_max_lookbehind ()
gint g_regex_get_string_number ()
GRegexCompileFlags g_regex_get_compile_flags ()
GRegexMatchFlags g_regex_get_match_flags ()
gchar * g_regex_escape_string ()
gchar * g_regex_escape_nul ()
gboolean g_regex_match_simple ()
gboolean g_regex_match ()
gboolean g_regex_match_full ()
gboolean g_regex_match_all ()
gboolean g_regex_match_all_full ()
gchar ** g_regex_split_simple ()
gchar ** g_regex_split ()
gchar ** g_regex_split_full ()
gchar * g_regex_replace ()
gchar * g_regex_replace_literal ()
gchar * g_regex_replace_eval ()
gboolean g_regex_check_replacement ()
GRegex * g_match_info_get_regex ()
const gchar * g_match_info_get_string ()
GMatchInfo * g_match_info_ref ()
void g_match_info_unref ()
void g_match_info_free ()
gboolean g_match_info_matches ()
gboolean g_match_info_next ()
gint g_match_info_get_match_count ()
gboolean g_match_info_is_partial_match ()
gchar * g_match_info_expand_references ()
gchar * g_match_info_fetch ()
gboolean g_match_info_fetch_pos ()
gchar * g_match_info_fetch_named ()
gboolean g_match_info_fetch_named_pos ()
gchar ** g_match_info_fetch_all ()
函数功能分类
// 创建
GRegex * g_regex_new ()
// 匹配函数
gboolean g_regex_match_simple ()
gboolean g_regex_match ()
gboolean g_regex_match_full ()
gboolean g_regex_match_all ()
gboolean g_regex_match_all_full ()
// 匹配信息相关函数
GRegex * g_match_info_get_regex ()
const gchar * g_match_info_get_string ()
GMatchInfo * g_match_info_ref ()
void g_match_info_unref ()
void g_match_info_free ()
gboolean g_match_info_matches ()
gboolean g_match_info_next ()
gint g_match_info_get_match_count ()
gboolean g_match_info_is_partial_match ()
gchar * g_match_info_expand_references ()
gchar * g_match_info_fetch ()
gboolean g_match_info_fetch_pos ()
gchar * g_match_info_fetch_named ()
gboolean g_match_info_fetch_named_pos ()
gchar ** g_match_info_fetch_all ()
// 字符串分隔
gchar ** g_regex_split_simple ()
gchar ** g_regex_split ()
gchar ** g_regex_split_full ()
// 字符串替换
gchar * g_regex_replace ()
gchar * g_regex_replace_literal ()
gchar * g_regex_replace_eval ()
gboolean g_regex_check_replacement ()
// 转义
gchar * g_regex_escape_string ()
gchar * g_regex_escape_nul ()
// 引用及解引用
GRegex * g_regex_ref ()
void g_regex_unref ()
// 其他
const gchar * g_regex_get_pattern ()
gint g_regex_get_max_backref ()
gint g_regex_get_capture_count ()
gboolean g_regex_get_has_cr_or_lf ()
gint g_regex_get_max_lookbehind ()
gint g_regex_get_string_number ()
GRegexCompileFlags g_regex_get_compile_flags ()
GRegexMatchFlags g_regex_get_match_flags ()
函数功能说明及综合演示
基本用法示例
本示例演示GLib提供的GRegex基本用法——数字匹配。
给定的待匹配字符串是"11aa222bb33333cccc44444dddddddd"
,模式串为"[0-9]+"
。
源码见glib_examples\glib_regex\glib_regex_basic
#include <glib.h>
int main (int argc, char** argv)
{
GRegex* regex;
GMatchInfo *match_info;
GError *error = NULL;
const gchar *str = "11aa222bb33333cccc44444dddddddd";
const gchar *pat = "[0-9]+";
regex = g_regex_new(pat, 0, 0, &error);
g_regex_match(regex, str, 0, &match_info);
while (g_match_info_matches(match_info)) {
gchar* word = g_match_info_fetch(match_info, 0);
g_print("%s \n",word);
g_free(word);
g_match_info_next(match_info, NULL);
}
g_match_info_free(match_info);
g_regex_unref(regex);
return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_basic
11
222
33333
44444
单词匹配
如果目标字符串是"the they them the"
,匹配串是"the"
,则四个单词都会被匹配到。如果我们只想精确匹配单词the,则可以使用\b
,该元字符匹配单词边界(只匹配位置,本身不占用字符),表达式写成C字符串为"\\bthe\\b"
。
源码见glib_examples\glib_regex\glib_regex_match_word
#include <glib.h>
int main (int argc, char** argv)
{
GRegex* regex;
GMatchInfo *match_info;
GError *error = NULL;
const gchar *str = "the name of the frog is themselves";
const gchar *pat = "\\bthe\\b";
regex = g_regex_new(pat, 0, 0, &error);
g_regex_match(regex, str, 0, &match_info);
while (g_match_info_matches(match_info)) {
gchar* word = g_match_info_fetch(match_info, 0);
g_print("%s \n",word);
g_free(word);
g_match_info_next(match_info, NULL);
}
g_match_info_free(match_info);
g_regex_unref(regex);
return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_match_word
the
the
可以看到,最后的themselves单词没有被匹配到。
后向引用(回溯引用)
有下面一些html字符串,
<h1>xx</H1>
<h2>xx</H2>
<h2>xx</h3>
如果我们用表达式<[hH][1-6]>.*?</[hH][1-6]>
匹配,则像上面那种以<h2>开头却以</h3>结尾
的标签也可以匹配到,但我们想要的只是正确的标签。这里可以使用后向引用表达式<[hH]([1-6])>.*?</[hH]\\1>
将[1-6]用一个括号括起来,然后使用\1
来引用它,如果[1-6]匹配的是数字2,则后面的这个\1
也只会匹配到数字2。后向引用的前提概念是表达式分组,括号内的([1-6])
即是一个分组,如果有多个括号,则说明该表达式有多个分组。如果整个表达式有N个分组,我们可以使用\1、\2、\3、...、\N
来引用各个分组。这就是后向引用。
GLib后向引用相关的函数如下:
// 返回模式中编号最大的后向引用的编号(模式中没有后向引用时返回0)
g_regex_get_max_backref
现举例说明。
源码见glib_examples\glib_regex\glib_regex_back_ref
#include <string.h>
#include <glib.h>
int main (int argc, char** argv)
{
GRegex* regex;
GMatchInfo *match_info;
GError *error = NULL;
const gchar *str = "<body><h1>Title</H1>hello:<br><h2>SubTitle1</h2>world.<H2>SubTitle2</H2>foobar.<h2>This is invalid HTML</h3></body>";
//const gchar *pat = "<[hH][1-6]>.*?</[hH][1-6]>";
const gchar *pat = "<[hH]([1-6])>.*?</[hH]\\1>";
regex = g_regex_new(pat, 0, 0, &error);
const gchar *tmp = NULL;
tmp = g_regex_get_pattern(regex);
if(NULL != tmp) {
g_print("get pattern: %s \n", tmp);
}
gint cnt = 0;
cnt = g_regex_get_max_backref(regex);
g_print("back reference num is: %d \n", cnt);
g_print("original str:\n%s\n", str);
g_regex_match(regex, str, 0, &match_info);
while (g_match_info_matches(match_info)) {
gchar* word = g_match_info_fetch(match_info, 0);
g_print("%s \n",word);
g_free(word);
g_match_info_next(match_info, NULL);
}
g_match_info_free(match_info);
g_regex_unref(regex);
return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_back_ref
get pattern: <[hH]([1-6])>.*?</[hH]\1>
back reference num is: 1
original str:
<body><h1>Title</H1>hello:<br><h2>SubTitle1</h2>world.<H2>SubTitle2</H2>foobar.<h2>This is invalid HTML</h3></body>
<h1>Title</H1>
<h2>SubTitle1</h2>
<H2>SubTitle2</H2>
match函数族–g_regex_match_simple
与g_regex_new() + g_regex_match()功能相同,但更为方便。如果模式相同,多次调用g_regex_match_simple不如调用g_regex_new一次+g_regex_match多次高效。
// 字符串是否与模式匹配
g_regex_match_simple
g_regex_match_simple函数使用非常简单,只需要传入模式串与待匹配字符串。
源码见glib_examples\glib_regex\glib_regex_match_simple
#include <glib.h>
int main (int argc, char** argv)
{
g_print("a* %s abc \n", g_regex_match_simple("a*", "abc", 0, 0)?"match":"not match");
g_print("[a-z]+ %s abc \n", g_regex_match_simple("[a-z]+", "abc", 0, 0)?"match":"not match");
g_print("[a-z]+ %s ABC \n", g_regex_match_simple("[a-z]+", "ABC", 0, 0)?"match":"not match");
return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_match_simple
a* match abc
[a-z]+ match abc
[a-z]+ not match ABC
match函数族–g_regex_match
g_regex_match是最常见的字符串模式匹配函数,下面演示其使用过程。
源码见glib_examples\glib_regex\glib_regex_match
#include <glib.h>
/*
 * Print one "Found: ..." line for every run of the letters A-Z that occurs
 * in `string`.
 */
static void test_print_uppercase_words (const gchar *string)
{
    GMatchInfo *hits;
    GRegex *upper_run = g_regex_new ("[A-Z]+", 0, 0, NULL);

    /* Start the scan, then advance match by match until none is left. */
    for (g_regex_match (upper_run, string, 0, &hits);
         g_match_info_matches (hits);
         g_match_info_next (hits, NULL))
    {
        gchar *run = g_match_info_fetch (hits, 0);

        g_print ("Found: %s in %s \n", run, string);
        g_free (run);
    }

    g_match_info_free (hits);
    g_regex_unref (upper_run);
}
int main (int argc, char** argv)
{
test_print_uppercase_words("Hello, World!");
test_print_uppercase_words("hello, world!");
test_print_uppercase_words("HELLO, world!");
test_print_uppercase_words("HELLO, WORLD!");
return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_match
Found: H in Hello, World!
Found: W in Hello, World!
Found: HELLO in HELLO, world!
Found: HELLO in HELLO, WORLD!
Found: WORLD in HELLO, WORLD!
match函数族–g_regex_match_full
g_regex_match_full也是常用的字符串模式匹配函数,和g_regex_match用法几乎一样,但多了string_len和start_position两个参数。如果string_len为-1,start_position为0,则和g_regex_match效果完全一样。
g_regex_match_full的原型为:
gboolean
g_regex_match_full (const GRegex *regex,
const gchar *string,
gssize string_len,
gint start_position,
GRegexMatchFlags match_options,
GMatchInfo **match_info,
GError **error);
其传参后运行效果如下:
string pattern string_len start_position expect1 expect2
“abcd1234efgh567xyz” “[0-9]+” -1 0 得到 “1234” “567”
“abcd1234efgh567xyz” “[0-9]+” 7 0 得到 “123”
“abcd1234efgh567xyz” “[0-9]+” -1 5 得到 “234” “567”
下面是示例函数。
源码见glib_examples\glib_regex\glib_regex_match_full
#include <glib.h>
int main (int argc, char** argv)
{
GRegex* regex;
GMatchInfo *match_info;
GError *error = NULL;
const gchar *str = "abcd1234efgh567xyz";
const gchar *pat = "[0-9]+";
regex = g_regex_new(pat, 0, 0, &error);
g_regex_match_full(regex, str, -1, 5, 0, &match_info, &error);
while (g_match_info_matches(match_info)) {
gchar* word = g_match_info_fetch(match_info, 0);
g_print("%s in %s \n",word, str);
g_free(word);
g_match_info_next(match_info, NULL);
}
g_match_info_free(match_info);
g_regex_unref(regex);
return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_match_full
234 in abcd1234efgh567xyz
567 in abcd1234efgh567xyz
分隔
//g_regex_split_simple相当于g_regex_new+g_regex_split,当模式固定但需要多次调用时,不如后者高效。
g_regex_split_simple
g_regex_split
g_regex_split_full
示例函数:
源码见glib_examples\glib_regex\glib_regex_string_split
#include <glib.h>
/*
 * Split a comma-separated sample string with g_regex_split_simple() and
 * print the pattern, the input, and every resulting token on its own line.
 */
static void g_regex_split_simple_test(void)
{
    int i;
    gchar **resv;
    const gchar *pat = ",\\s*";
    const gchar *str = "abc, ad,, d1, bee , a,dfd, pp";

    resv = g_regex_split_simple(pat, str, 0, 0);
    g_print("pattern: %s\n", pat);
    g_print("ori: %s\n", str);

    /*
     * The index is read and advanced in separate expressions: using `i` and
     * `i++` as two arguments of the same call is unsequenced and therefore
     * undefined behaviour. Labels are 1-based so the output stays identical
     * to the sample run documented for this example.
     * A NULL result (no token array) is skipped.
     */
    for (i = 0; resv != NULL && resv[i] != NULL; i++) {
        g_print("[%d]%s \n", i + 1, resv[i]);
    }

    /* Frees the array and every token in it; accepts NULL. */
    g_strfreev(resv);
}
/* Entry point: run the split demonstration; command-line arguments are unused. */
int main (int argc, char** argv)
{
    (void) argc;
    (void) argv;

    g_regex_split_simple_test();

    return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_string_split
pattern: ,\s*
ori: abc, ad,, d1, bee , a,dfd, pp
[1]abc
[2]ad
[3]
[4]d1
[5]bee
[6]a
[7]dfd
[8]pp
替换和批量替换
// 将匹配到的字符串替换为新的字符串
g_regex_replace
// 批量替换:对每个匹配项调用回调函数,由回调函数决定替换内容
g_regex_replace_eval
字符串替换示例:
源码见glib_examples\glib_regex\glib_regex_replace
#include <glib.h>
static void g_regex_replace_test(void)
{
GRegex *reg;
gchar *res;
gchar *ori = "abcd1234ghi56xyz";
gchar *rep = "NUM";
reg = g_regex_new("[0-9]+", 0, 0, NULL);
res = g_regex_replace(reg, ori, -1, 0, rep, 0, NULL);
g_print("ori:%s \n", ori);
g_print("res:%s \n", res);
g_free(res);
g_regex_unref(reg);
return;
}
/* Entry point: run the replace demonstration; command-line arguments are unused. */
int main (int argc, char** argv)
{
    (void) argc;
    (void) argv;

    g_regex_replace_test();

    return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_replace
ori:abcd1234ghi56xyz
res:abcdNUMghiNUMxyz
字符串批量替换示例:
(注意:g_regex_replace_eval的返回值需要调用者显式释放。)
源码见glib_examples\glib_regex\glib_regex_replace_multiple
#include <glib.h>
/*
 * Replacement callback for g_regex_replace_eval().
 *
 * info: the current match.
 * res:  output buffer; the replacement text is appended to it.
 * data: a GHashTable mapping matched text to its replacement string.
 *
 * Appends the mapped replacement for the matched text. If the table has no
 * entry for it, the matched text itself is appended so nothing is lost and
 * g_string_append() is never called with NULL.
 *
 * Returns FALSE so that the replacement process continues with the next
 * match.
 */
static gboolean eval_cb (const GMatchInfo *info, GString *res, gpointer data)
{
    gchar *match = g_match_info_fetch (info, 0);
    const gchar *replacement = NULL;

    if (match != NULL) {
        replacement = g_hash_table_lookup ((GHashTable *)data, match);
    }

    if (replacement != NULL) {
        g_string_append (res, replacement);
    } else if (match != NULL) {
        /* No mapping for this text: keep it unchanged. */
        g_string_append (res, match);
    }

    g_free (match);
    return FALSE;
}
static void g_regex_replace_eval_test(void)
{
GRegex *reg;
GHashTable *h;
gchar *res;
gchar *ori = "a1b2c3d74A151B12C633";
h = g_hash_table_new (g_str_hash, g_str_equal);
g_hash_table_insert (h, "1", "ONE");
g_hash_table_insert (h, "2", "TWO");
g_hash_table_insert (h, "3", "THREE");
g_hash_table_insert (h, "4", "FOUR");
reg = g_regex_new ("1|2|3|4", 0, 0, NULL);
res = g_regex_replace_eval (reg, ori, -1, 0, 0, eval_cb, h, NULL);
g_print("ori:%s \n", ori);
g_print("res:%s \n", res);
g_free(res);
g_hash_table_destroy (h);
g_regex_unref(reg);
return;
}
/* Entry point: run the replace-eval demonstration; command-line arguments are unused. */
int main (int argc, char** argv)
{
    (void) argc;
    (void) argv;

    g_regex_replace_eval_test();

    return 0;
}
运行结果:
[root@centos7_6 build]# ./glib_regex_replace_multiple
ori:a1b2c3d74A151B12C633
res:aONEbTWOcTHREEd7FOURAONE5ONEBONETWOC6THREETHREE
g_match_info_xxx()相关函数
GRegex * g_match_info_get_regex ()
const gchar * g_match_info_get_string ()
GMatchInfo * g_match_info_ref ()
void g_match_info_unref ()
void g_match_info_free ()
gboolean g_match_info_matches ()
gboolean g_match_info_next ()
gint g_match_info_get_match_count ()
gboolean g_match_info_is_partial_match ()
gchar * g_match_info_expand_references ()
gchar * g_match_info_fetch ()
gboolean g_match_info_fetch_pos ()
gchar * g_match_info_fetch_named ()
gboolean g_match_info_fetch_named_pos ()
gchar ** g_match_info_fetch_all ()
g_match_info_xxx()相关的函数如上所示,常用的有以下几个:
// 判断上一次匹配操作是否成功(即match_info中当前是否有匹配结果)。
g_match_info_matches
// 取出模式匹配到的字符串
g_match_info_fetch
// 释放匹配到的模式信息
g_match_info_free
获取GRegex对应的模式串
// 得到GRegex对应的pattern字符串
g_regex_get_pattern
这个函数在模式串含有转义字符时特别有用:C源码中需要把反斜杠写成\\再传给g_regex_new,而g_regex_get_pattern返回的是GRegex实际保存的模式串(例如源码中的"\\1"会得到\1),可以用来校对模式是否正确。
检测表达式是否包含回车换行
有以下特殊字符:
- CR:Carriage Return,对应ASCII中转义字符\r,表示回车
- LF:Linefeed,对应ASCII中转义字符\n,表示换行
- CRLF:Carriage Return & Linefeed,\r\n,表示回车并换行
g_regex_get_has_cr_or_lf函数用来检测表达式中是否显式包含回车(CR)或换行(LF)。
下面是本函数的使用示例片段:
/*
 * Compile a pattern whose character class contains literal CR and LF
 * characters and assert that g_regex_get_has_cr_or_lf() returns TRUE for it.
 */
static void test_explicit_crlf (void)
{
    GRegex *regex = g_regex_new ("[\r\n]a", 0, 0, NULL);

    g_assert_cmpint (g_regex_get_has_cr_or_lf (regex), ==, TRUE);

    g_regex_unref (regex);
}
前向断言和后向断言
// 获取最长后向断言的字符个数
g_regex_get_max_lookbehind
要理解先行断言(即前向断言,lookahead)和后行断言(即后向断言,lookbehind),需要先理解正则表达式位置的概念。
所谓位置,是指字符串中(每行)第一个字符的左边、最后一个字符的右边以及相邻字符的中间。
举例来说,字符串"regular express"
,regular
右边和express
左边是有位置的,两者并不相连,这个很好理解,但第一个字符r和第二个字符e之间也有一个位置,这个位置的左边是r,右边是e。与^代表开头,$代表结尾,\b代表单词边界一样,先行断言和后行断言也只匹配某些位置,在匹配过程中,不占用字符,因此也叫零宽断言。
正则表达式的先行断言和后行断言一共有4种形式:
- (?=pattern) 零宽正向先行断言(zero-width positive lookahead assertion)
- (?!pattern) 零宽负向先行断言(zero-width negative lookahead assertion)
- (?<=pattern) 零宽正向后行断言(zero-width positive lookbehind assertion)
- (?<!pattern) 零宽负向后行断言(zero-width negative lookbehind assertion)
DFA算法函数
PCRE库是一个NFA正则表达式引擎。g_regex_match_all和g_regex_match_all_full模式匹配时用的是DFA算法,这里不做讨论。