正则表达式的匹配函数

最新推荐文章于 2021-09-25 23:11:34 发布

chudu3961

最新推荐文章于 2021-09-25 23:11:34 发布

阅读量891

点赞数

文章标签： java 数据库

原文链接：https://my.oschina.net/yonj1e/blog/879874

版权

正则表达式的匹配函数

功能：根据正则表达式匹配目标字符串，并返回匹配字符串的开始位置
   参数：
       arg1 -- 目标字符串
       arg2 -- 正则表达式
       arg3 -- 从目标字符串什么位置开始匹配
       arg4 -- 标识第几个匹配的内容（目标字符串中可能有几处可以匹配成功）

   要求：
       1. 输入参数为arg1，arg2时，arg3，arg4默认值为1
       2. 输入参数为arg1，arg2，arg3时，arg4默认为1
       3. 三种输入不同的参数在一个函数中完成

PG 有正则表达式的相关功能，可以参考PG的函数实现上述功能

PostgreSQL 正则表达式常用函数：https://my.oschina.net/yonj1e/blog/879875

大致的实现思路：

pg_proc.h 添加函数声明，myfuncs.c 实现函数，获取参数，根据参数 arg3 修改目标字符串，编译正则表达式，匹配正则表达式，输出匹配位置，释放，添加错误提示信息。

添加函数声明：

src/include/catalog/pg_proc.h

/*
 * regexp_match_str(text, text [, int4 [, int4]]) returns int4.
 *
 * All three signatures share the C function regexp_match_str.  The function
 * returns one scalar value, so prorows is 0 and proretset is f; marking it
 * as set-returning would stop it from being used in WHERE clauses and other
 * scalar contexts and would distort planner row estimates.
 */
DATA(insert OID = 6672 (  regexp_match_str PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_match_str _null_ _null_ _null_ ));
DESCR("Regular expression matches the target string.");
DATA(insert OID = 6673 (  regexp_match_str PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 23 "25 25 23" _null_ _null_ _null_ _null_ _null_  regexp_match_str _null_ _null_ _null_ ));
DESCR("Regular expression matches the target string.");
DATA(insert OID = 6674 (  regexp_match_str PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "25 25 23 23" _null_ _null_ _null_ _null_ _null_ regexp_match_str _null_ _null_ _null_ ));
DESCR("Regular expression matches the target string.");

实现函数：

获取参数，返回结果

/*
 * Evaluate to PG_GETARG_<type>(n) when the call carries more than n
 * arguments and argument n is not NULL; otherwise evaluate to defval.
 * This lets one C function serve the 2-, 3- and 4-argument SQL signatures.
 */
#define PG_GETARG_IF_EXISTS(n, type, defval) \
	((PG_NARGS() > (n) && !PG_ARGISNULL(n)) ? PG_GETARG_##type(n) : (defval))

/* 
 * Regular expression matches the target string. 
 *
 * First step of the implementation: only fetches the four arguments
 * (the optional ones default to 1) and returns 0 as a placeholder.
 * author:young
 */
Datum 
regexp_match_str (PG_FUNCTION_ARGS)
{
	text *orginal_str = PG_GETARG_TEXT_P(0);	/* arg1: target string */
	text *text_re = PG_GETARG_TEXT_P(1);	/* arg2: regular expression */
	int32 beg = PG_GETARG_IF_EXISTS(2, INT32, 1);	/* arg3: start position, default 1 */
	int32 num = PG_GETARG_IF_EXISTS(3, INT32, 1);	/* arg4: which match to report, default 1 */

	PG_RETURN_INT32(0);
}

修改目标字符串，使其从 arg3 指定的位置开始匹配；没有输入 arg3 时，默认值是 1

    /* Modify the target string, starting from the beg position */
	char *src;
	int len;
	text *orig_str;
	char dest[len];
	
	src = text_to_cstring(orginal_str);	
	len = strlen(src);
	strncpy(dest, src + beg - 1, len - beg + 1);
	//dest[len - beg + 1] = '\0';
	orig_str = cstring_to_text(dest);

PG 中处理正则表达式常用函数及相关参数类型的声明

/*
 * the prototypes for exported functions
 */
extern int	pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
extern int	pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
extern void pg_regfree(regex_t *);

typedef unsigned int pg_wchar;

int
pg_regcomp(regex_t *re,
		   const chr *string, /* regular expression string */
		   size_t len, /* length of the regular expression string */
		   int flags,
		   Oid collation)

int
pg_regexec(regex_t *re, /* regular expression already compiled by regcomp */
		   const chr *string, /* target string */
		   size_t len, /* length of the target string */
		   size_t search_start, /* position at which matching starts */
		   rm_detail_t *details, /* NULL */
		   size_t nmatch, /* length of the regmatch_t array */
		   regmatch_t pmatch[], /* array of regmatch_t structs that receives the positions of the matched text */
		   int flags)

pg_regcomp()、pg_regexec()编译匹配字符串的类型是pg_wchar，所以需要先对目标字符串，正则表达式进行类型转换（可参考PG内置的几个正则函数，regexp_matches()等）

	/* Target string: length from the varlena payload, wide-char copy and its length. */
	int orig_len;
	pg_wchar *wide_str;
	int wide_len;

	/* Pattern: the same three values for the regular expression text. */
	int text_re_len;
	pg_wchar *pattern;
	int pattern_len;

	/*
	 * convert string to pg_wchar form for matching
	 *
	 * The buffer holds orig_len + 1 wide characters; presumably the extra
	 * slot is for a terminator written by pg_mb2wchar_with_len -- confirm.
	 * wide_len is whatever pg_mb2wchar_with_len returns and is what gets
	 * passed to the regex functions as the string length.
	 */
	orig_len = VARSIZE_ANY_EXHDR(orig_str);
	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

	/* Convert pattern string to wide characters (same scheme as above) */
	text_re_len = VARSIZE_ANY_EXHDR(text_re);
	pattern = (pg_wchar *) palloc(sizeof(pg_wchar) * (text_re_len + 1));
	pattern_len = pg_mb2wchar_with_len(VARDATA_ANY(text_re), pattern, text_re_len);

编译，匹配，释放

    regex_t re;
	int cflags = REG_ADVANCED;	/* flags handed to pg_regcomp */
	//Oid collation = PG_GET_COLLATION();

	regmatch_t *pmatch;	/* match positions filled in by pg_regexec */
	int pmatch_len;	/* number of entries in pmatch */
	bool use_subpatterns = true;

	int start_search = 0;	/* offset at which the next pg_regexec call starts */
	int prev_match_end = 0;	/* end offset of the most recent match */
	int count = 1;	/* ordinal of the match the next successful pg_regexec yields */

    /* NOTE(review): the return value of pg_regcomp is not checked in this step. */
    pg_regcomp(&re, pattern, pattern_len, cflags, PG_GET_COLLATION());

    /* One pmatch slot per subexpression plus one, or a single slot if there are none. */
    if (use_subpatterns && re.re_nsub > 0)
	{
		pmatch_len = re.re_nsub + 1;
	}
	else
	{
		use_subpatterns = false;
		pmatch_len = 1;
	}

	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);

    /* Walk the matches one by one until the num-th is reached or matching stops. */
    while (pg_regexec(&re, wide_str, wide_len, start_search, NULL, pmatch_len, pmatch, 0) == REG_OKAY)
	{
		if (num == count)
		{
			/*
			 * Requested match found: report its start offset, 1-based.
			 * NOTE(review): this leaves before the pg_regfree call below.
			 */
			PG_RETURN_INT32(pmatch[0].rm_so + 1);
		}
		count++;
		prev_match_end = pmatch[0].rm_eo;

		/* Resume the search where this match ended. */
		start_search = prev_match_end;

		/* A zero-length match would be found again at the same offset; step past it. */
		if (pmatch[0].rm_so == pmatch[0].rm_eo)
		{
			start_search++;
		}

		/* Stop once the search offset has moved beyond the end of the string. */
		if (start_search > wide_len)
		{
			break;
		}
	}

    pg_regfree(&re);

添加上参数错误提示，执行 regcomp 或者 regexec 产生错误的时候，regerror返回一个包含错误信息的字符串，完整的代码如下：

/* 
 * Regular expression matches the target string. 
 * author:young
 */
Datum 
regexp_match_str (PG_FUNCTION_ARGS)
{
	text *orginal_str = PG_GETARG_TEXT_P(0);
	text *text_re = PG_GETARG_TEXT_P(1);
	int32 beg = PG_GETARG_IF_EXISTS(2, INT32, 1);
	int32 num = PG_GETARG_IF_EXISTS(3, INT32, 1);

	/* Modify the target string, starting from the beg position */
	char *src;
	int len;
	text *orig_str;
	char dest[len];
	
	src = text_to_cstring(orginal_str);	
	len = strlen(src);
	strncpy(dest, src + beg - 1, len - beg + 1);
	//dest[len - beg + 1] = '\0';
	orig_str = cstring_to_text(dest);

	if (!((1 <= beg && beg <= len) && (1 <= num)))

	{
		ereport(ERROR,
			(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
			errmsg("Parameters> = 1, such as 1.2 ....")));
	}

	int orig_len;
	pg_wchar *wide_str;
	int wide_len;

	int text_re_len;
	pg_wchar *pattern;
	int pattern_len;

	regex_t re;
	int cflags = REG_ADVANCED;
	//Oid collation = PG_GET_COLLATION();
	regmatch_t *pmatch;
	int pmatch_len;
	bool use_subpatterns = true;
	int start_search = 0;
	int prev_match_end = 0;
	int count = 1;

	/* convert string to pg_wchar form for matching */
	orig_len = VARSIZE_ANY_EXHDR(orig_str);
	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

	/* Convert pattern string to wide characters */
	text_re_len = VARSIZE_ANY_EXHDR(text_re);
	pattern = (pg_wchar *) palloc(sizeof(pg_wchar) * (text_re_len + 1));
	pattern_len = pg_mb2wchar_with_len(VARDATA_ANY(text_re), pattern, text_re_len);

	int regcomp_result;
	char errMsg[100];
	regcomp_result = pg_regcomp(&re, pattern, pattern_len, cflags, PG_GET_COLLATION());
	
	if (regcomp_result != REG_OKAY)
	{
		pg_regerror(regcomp_result, &re, errMsg, sizeof(errMsg));
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
				 errmsg("invalid regular expression: %s", errMsg)));
	}

	if (use_subpatterns && re.re_nsub > 0)
	{
		pmatch_len = re.re_nsub + 1;
	}
	else
	{
		use_subpatterns = false;
		pmatch_len = 1;
	}

	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);

	int regexec_result;
	regexec_result = pg_regexec(&re, wide_str, wide_len, start_search, NULL, pmatch_len, pmatch, 0);

	if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
	{
		pg_regerror(regexec_result, &re, errMsg, sizeof(errMsg));
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
				 errmsg("regular expression failed: %s", errMsg)));
	}

	bool match_result = false;
	while (pg_regexec(&re, wide_str, wide_len, start_search, NULL, pmatch_len, pmatch, 0) == REG_OKAY)
	{
		match_result = true;
		if (num == count)
		{
			PG_RETURN_INT32(pmatch[0].rm_so + 1);
		}
		count++;
		prev_match_end = pmatch[0].rm_eo;

		start_search = prev_match_end;

		if (pmatch[0].rm_so == pmatch[0].rm_eo)
		{
			start_search++;
		}

		if (start_search > wide_len)
		{
			break;
		}
	}

	if (count == 1 || match_result)
	{
		ereport(ERROR,
			(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
			errmsg("NO MATCH!")));
	}

	pg_regfree(&re);

	PG_RETURN_INT32(0);
}

测试：

postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,2);
 regexp_match_str 
------------------
               17
(1 row)

postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)');
 regexp_match_str 
------------------
                1
(1 row)

postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,2);
 regexp_match_str 
------------------
               17
(1 row)

postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,3);
ERROR:  NO MATCH!
STATEMENT:  select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,3);
ERROR:  NO MATCH!
postgres=#

转载于:https://my.oschina.net/yonj1e/blog/879874

chudu3961

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
正则表达式的匹配函数

正则表达式的匹配函数功能：根据正则表达式匹配目标字符串，并返回匹配字符串的开始位置参数： arg1 -- 目标字符串 arg2 -- 正则表达式 arg3 -- 从目标字符串什么位置开始匹配 arg4 -- 标识第几...
复制链接

扫一扫