正则表达式的匹配函数
功能:根据正则表达式匹配目标字符串,并返回匹配字符串的开始位置
参数:
arg1 -- 目标字符串
arg2 -- 正则表达式
arg3 -- 从目标字符串什么位置开始匹配
arg4 -- 标识第几个匹配的内容(目标字符串中可能有几处可以匹配成功)
要求:
1. 输入参数为arg1,arg2时,arg3,arg4默认值为1
2. 输入参数为arg1,arg2,arg3时,arg4默认为1
3. 三种输入不同的参数在一个函数中完成
PG 有正则表达式的相关功能,可以参考PG的函数实现上述功能
PostgreSQL 正则表达式 常用函数:https://my.oschina.net/yonj1e/blog/879875
大致的实现思路:
pg_proc.h 添加函数声明,myfuncs.c 实现函数,获取参数,根据参数 arg3 修改目标字符串,编译正则表达式,匹配正则表达式,输出匹配位置,释放,添加错误提示信息。
添加函数声明:
src/include/catalog/pg_proc.h
/*
 * regexp_match_str: three overloads sharing one C entry point (prosrc).
 * Argument type OID 25 is text, 23 is int4; the result type is int4.
 * The function returns a single scalar value, so proretset is 'f' and
 * prorows is 0 (prorows must be nonzero only for set-returning functions).
 */
/* regexp_match_str(text, text) */
DATA(insert OID = 6672 ( regexp_match_str PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_match_str _null_ _null_ _null_ ));
DESCR("Regular expression matches the target string.");
/* regexp_match_str(text, text, int4) */
DATA(insert OID = 6673 ( regexp_match_str PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 23 "25 25 23" _null_ _null_ _null_ _null_ _null_ regexp_match_str _null_ _null_ _null_ ));
DESCR("Regular expression matches the target string.");
/* regexp_match_str(text, text, int4, int4) */
DATA(insert OID = 6674 ( regexp_match_str PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "25 25 23 23" _null_ _null_ _null_ _null_ _null_ regexp_match_str _null_ _null_ _null_ ));
DESCR("Regular expression matches the target string.");
实现函数:
获取参数,返回结果
/*
 * PG_GETARG_IF_EXISTS(n, type, defval)
 *
 * Expands to argument number n, fetched with PG_GETARG_<type>(n).  When the
 * call was made with n or fewer arguments, or argument n is NULL, expands to
 * defval instead.  This lets one C function serve several SQL overloads that
 * differ only in trailing optional arguments.
 */
#define PG_GETARG_IF_EXISTS(n, type, defval) \
	((PG_NARGS() <= (n) || PG_ARGISNULL(n)) ? (defval) : PG_GETARG_##type(n))
/*
 * regexp_match_str
 *
 * First step of the implementation: only fetch the arguments.
 *   arg 0 - target string
 *   arg 1 - regular expression
 *   arg 2 - start position, 1 when not supplied
 *   arg 3 - which occurrence to report, 1 when not supplied
 * The matching logic is not present yet, so the result is always 0.
 *
 * author:young
 */
Datum
regexp_match_str(PG_FUNCTION_ARGS)
{
	text	   *orginal_str;
	text	   *text_re;
	int32		beg;
	int32		num;

	/* mandatory arguments */
	orginal_str = PG_GETARG_TEXT_P(0);
	text_re = PG_GETARG_TEXT_P(1);

	/* optional arguments, both defaulting to 1 */
	beg = PG_GETARG_IF_EXISTS(2, INT32, 1);
	num = PG_GETARG_IF_EXISTS(3, INT32, 1);

	/* placeholder result until matching is implemented */
	PG_RETURN_INT32(0);
}
修改目标字符串,使其从 arg3 指定的位置开始匹配;没有输入 arg3 时默认值是 1
/* Modify the target string, starting from the beg position */
char *src;
int len;
text *orig_str;
char dest[len];
src = text_to_cstring(orginal_str);
len = strlen(src);
strncpy(dest, src + beg - 1, len - beg + 1);
//dest[len - beg + 1] = '\0';
orig_str = cstring_to_text(dest);
PG 中处理正则表达式常用函数及相关参数类型的声明
/*
* the prototypes for exported functions
*/
extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
extern void pg_regfree(regex_t *);
/* wide-character type taken by pg_regcomp() and pg_regexec() */
typedef unsigned int pg_wchar;
/* Parameter list of pg_regcomp(): compiles a regular expression into *re. */
int
pg_regcomp(regex_t *re,
const chr *string, /* the regular expression string */
size_t len, /* length of the regular expression string */
int flags,
Oid collation)
/* Parameter list of pg_regexec(): runs a compiled expression against a string. */
int
pg_regexec(regex_t *re, /* regular expression already compiled by regcomp */
const chr *string, /* the target string */
size_t len, /* length of the target string */
size_t search_start, /* position at which matching starts */
rm_detail_t *details, /* NULL */
size_t nmatch, /* length of the regmatch_t array */
regmatch_t pmatch[], /* array of regmatch_t structs that receives the position information of the matched text */
int flags)
pg_regcomp()、pg_regexec()编译匹配字符串的类型是pg_wchar,所以需要先对目标字符串,正则表达式进行类型转换(可参考PG内置的几个正则函数,regexp_matches()等)
/*
 * pg_regcomp() and pg_regexec() take pg_wchar input, so both the target
 * string and the pattern are converted first.  Each output buffer has room
 * for one pg_wchar per input byte plus one extra element.
 */

/* target string -> pg_wchar form for matching */
int orig_len = VARSIZE_ANY_EXHDR(orig_str);
pg_wchar *wide_str = palloc((orig_len + 1) * sizeof(pg_wchar));
int wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

/* pattern string -> pg_wchar form for compilation */
int text_re_len = VARSIZE_ANY_EXHDR(text_re);
pg_wchar *pattern = palloc((text_re_len + 1) * sizeof(pg_wchar));
int pattern_len = pg_mb2wchar_with_len(VARDATA_ANY(text_re), pattern, text_re_len);
编译,匹配,释放
regex_t re;
int cflags = REG_ADVANCED;
regmatch_t pmatch[1];	/* only the whole-match span (element 0) is needed */
int start_search = 0;
int count = 1;
int match_pos = 0;	/* 1-based position of the requested match, 0 = none */

/* Return codes are not inspected in this step. */
pg_regcomp(&re, pattern, pattern_len, cflags, PG_GET_COLLATION());

while (pg_regexec(&re, wide_str, wide_len, start_search, NULL, 1, pmatch, 0) == REG_OKAY)
{
	if (num == count)
	{
		/* rm_so is 0-based; report a 1-based position */
		match_pos = pmatch[0].rm_so + 1;
		break;
	}
	count++;

	/* continue after this match; step over an empty match to avoid looping */
	start_search = pmatch[0].rm_eo;
	if (pmatch[0].rm_so == pmatch[0].rm_eo)
	{
		start_search++;
	}
	if (start_search > wide_len)
	{
		break;
	}
}

/*
 * Release the compiled expression on every path.  Returning from inside the
 * loop would skip this call and leak the compiled regex.
 */
pg_regfree(&re);
PG_RETURN_INT32(match_pos);
再加上参数错误提示:当 pg_regcomp 或 pg_regexec 执行出错时,用 pg_regerror 取得包含错误信息的字符串并报告。完整的代码如下:
/*
* Regular expression matches the target string.
* author:young
*/
Datum
regexp_match_str (PG_FUNCTION_ARGS)
{
text *orginal_str = PG_GETARG_TEXT_P(0);
text *text_re = PG_GETARG_TEXT_P(1);
int32 beg = PG_GETARG_IF_EXISTS(2, INT32, 1);
int32 num = PG_GETARG_IF_EXISTS(3, INT32, 1);
/* Modify the target string, starting from the beg position */
char *src;
int len;
text *orig_str;
char dest[len];
src = text_to_cstring(orginal_str);
len = strlen(src);
strncpy(dest, src + beg - 1, len - beg + 1);
//dest[len - beg + 1] = '\0';
orig_str = cstring_to_text(dest);
if (!((1 <= beg && beg <= len) && (1 <= num)))
{
ereport(ERROR,
(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
errmsg("Parameters> = 1, such as 1.2 ....")));
}
int orig_len;
pg_wchar *wide_str;
int wide_len;
int text_re_len;
pg_wchar *pattern;
int pattern_len;
regex_t re;
int cflags = REG_ADVANCED;
//Oid collation = PG_GET_COLLATION();
regmatch_t *pmatch;
int pmatch_len;
bool use_subpatterns = true;
int start_search = 0;
int prev_match_end = 0;
int count = 1;
/* convert string to pg_wchar form for matching */
orig_len = VARSIZE_ANY_EXHDR(orig_str);
wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
/* Convert pattern string to wide characters */
text_re_len = VARSIZE_ANY_EXHDR(text_re);
pattern = (pg_wchar *) palloc(sizeof(pg_wchar) * (text_re_len + 1));
pattern_len = pg_mb2wchar_with_len(VARDATA_ANY(text_re), pattern, text_re_len);
int regcomp_result;
char errMsg[100];
regcomp_result = pg_regcomp(&re, pattern, pattern_len, cflags, PG_GET_COLLATION());
if (regcomp_result != REG_OKAY)
{
pg_regerror(regcomp_result, &re, errMsg, sizeof(errMsg));
ereport(ERROR,
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
errmsg("invalid regular expression: %s", errMsg)));
}
if (use_subpatterns && re.re_nsub > 0)
{
pmatch_len = re.re_nsub + 1;
}
else
{
use_subpatterns = false;
pmatch_len = 1;
}
pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
int regexec_result;
regexec_result = pg_regexec(&re, wide_str, wide_len, start_search, NULL, pmatch_len, pmatch, 0);
if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
{
pg_regerror(regexec_result, &re, errMsg, sizeof(errMsg));
ereport(ERROR,
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
errmsg("regular expression failed: %s", errMsg)));
}
bool match_result = false;
while (pg_regexec(&re, wide_str, wide_len, start_search, NULL, pmatch_len, pmatch, 0) == REG_OKAY)
{
match_result = true;
if (num == count)
{
PG_RETURN_INT32(pmatch[0].rm_so + 1);
}
count++;
prev_match_end = pmatch[0].rm_eo;
start_search = prev_match_end;
if (pmatch[0].rm_so == pmatch[0].rm_eo)
{
start_search++;
}
if (start_search > wide_len)
{
break;
}
}
if (count == 1 || match_result)
{
ereport(ERROR,
(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
errmsg("NO MATCH!")));
}
pg_regfree(&re);
PG_RETURN_INT32(0);
}
测试:
postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,2);
regexp_match_str
------------------
17
(1 row)
postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)');
regexp_match_str
------------------
1
(1 row)
postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,2);
regexp_match_str
------------------
17
(1 row)
postgres=# select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,3);
ERROR: NO MATCH!
STATEMENT: select regexp_match_str('young@yahoo.com young@highgo.com','(\w+@[\w.]+com)',1,3);
ERROR: NO MATCH!
postgres=#