写这个模块的目的在于更加方便地使用基于C语言的正则匹配。在做通信协议报文解析的时候,正则表达式匹配相当有效,而Linux自带的正则表达式相关函数(regcomp、regexec等)用起来相对来说还是比较麻烦。
例如,若想把字符串in中的mac地址和timeout和freetime提取出来,可以设计正则表达式为p:
<span style="font-family:Courier New;">char *in = "QueryTimesResult,MAC=00:22:33:44:55:77,timeout=1,freetime=10;00:22:33:44:55:88,timeout=0,freetime=20";
char p[] = "\\s*([A-F0-9:]+)\\s*,\\s*timeout\\s*=\\s*([0-9]+)\\s*,\\s*freetime\\s*=\\s*([0-9]+)";</span>
调用regex_match_all接口,可以把匹配结果都提取到内存块match之中;其中cell_num代表着正则表达式的分组数目,即小括号对的数目;match_num代表在字符串in中可以匹配正则表达式p的次数。结构体s_mbc相当重要,代表正则表达式中一个分组(小括号)所匹配到的内容(整数,或者字符串的首地址)。
<span style="font-family:Courier New;">regex_match_all(in, p, &match, &cell_num, &match_num);</span>
所以一个match指针代表:
                0       1         ...     cell_num-1
match ----->+-------+-------+------------+-------+
          0 | s_mbc | s_mbc | .......... | s_mbc |
            +-------+-------+------------+-------+
        ... | .................................. |
            +-------+-------+------------+-------+
match_num-1 | s_mbc | s_mbc | .......... | s_mbc |
            +-------+-------+------------+-------+
匹配结果放入到match中之后,就可以方便的把值拿出来使用。
在match使用完之后,需要释放内存,调用接口函数regex_free_all即可:
<span style="font-family:Courier New;">regex_free_all(void* match, int cell_num, int match_num)</span>
源代码如下:
<span style="font-family:Courier New;">#include <stdio.h>
#include <string.h>
#include <regex.h>
#include <stdlib.h>
#include <mcheck.h>
/* Number of regmatch_t slots: the whole match plus its sub-expressions. */
#define SUBSLEN 10
/* Size in bytes of the buffer that receives regerror() messages. */
#define EBUFLEN 128
/* Size in bytes of the scratch buffer for one captured sub-string. */
#define BUFLEN 1024

/* Thin aliases for the POSIX <regex.h> entry points; arguments pass through unchanged. */
#define reg_comp(preg, regex, cflags) regcomp(preg, regex, cflags)
#define reg_error(code, preg, msgbuf, msgsize) regerror(code, preg, msgbuf, msgsize)
#define reg_exec(preg, text, nmatch, pmatch, eflags) regexec(preg, text, nmatch, pmatch, eflags)
#define reg_free(preg) regfree(preg)
/* Tag telling which member of a capture cell's union is in use. */
typedef enum data_type {
    integer = 0,    /* the cell holds an int */
    string = 1      /* the cell holds a pointer to a C string */
} e_dt;
/*
 * One captured group (the text matched inside one pair of parentheses):
 * a type tag plus the value that tag selects.
 */
typedef struct match_brace_content {
/* selects which member of data is meaningful */
e_dt type;
union {
/* value when type is integer */
int integer;
/* text when type is string; regex_free_all() passes it to free() */
char *string;
} data;
} s_mbc;
/*
 * Description:
 *   Scan a regular expression for its parenthesised groups and build a
 *   template that says, for each group in order, whether its capture is
 *   to be stored as an integer or as a string. A group is a string as soon
 *   as its body contains any ASCII letter (a-z or A-Z, bounds included);
 *   otherwise it is an integer.
 *
 *   Limitations: nested groups are not supported, and a parenthesis inside
 *   a bracket expression such as "[(]" is still taken for a group delimiter.
 *
 * Parameters:
 *   pattern  NUL-terminated regular expression to inspect
 *   out      receives a malloc'd array of *found templates; the caller
 *            owns it and must free() it
 *   found    receives the number of groups
 *
 * Returns:
 *   0 when at least one group was found (out and found are set);
 *   -1 on a NULL argument, on allocation failure, or when the pattern has
 *   no group (out and found are left untouched).
 */
int init_brace_mode(char* pattern, s_mbc** out, int* found)
{
    int ret = -1;
    char *stack = NULL;        /* body of the group currently being read */
    s_mbc *braces = NULL;      /* template array, grown one group at a time */
    size_t str_len = 0, stack_top = 0, i, j;
    int in_group = 0, brace_pair_cnt = 0;

    if (pattern == NULL || out == NULL || found == NULL) {
        printf("null input pointer\n");
        goto err;
    }

    str_len = strlen(pattern);
    /* +1 so that an empty pattern never turns into a zero-byte request */
    stack = malloc(str_len + 1);
    if (stack == NULL) {
        printf("no spare memory!\n");
        goto err;
    }

    for (i = 0; i < str_len; i++) {
        char c = pattern[i];

        /* A backslash-escaped character is literal: "\(" and "\)" do not
         * delimit a group. Both bytes still belong to the group body. */
        if (c == '\\' && i + 1 < str_len) {
            if (in_group) {
                stack[stack_top++] = c;
                stack[stack_top++] = pattern[i + 1];
            }
            i++;
            continue;
        }

        if (c == '(') {
            in_group = 1;
            continue;
        }

        /* Only a ')' that closes an open '(' ends a group. */
        if (c == ')' && in_group) {
            s_mbc *grown = realloc(braces, ((size_t)brace_pair_cnt + 1) * sizeof(*grown));
            if (grown == NULL) {
                printf("no spare memory!\n");
                goto err;
            }
            braces = grown;

            /* Decide whether the group matches a string or a number. */
            for (j = 0; j < stack_top; j++) {
                if ((stack[j] >= 'a' && stack[j] <= 'z') ||
                    (stack[j] >= 'A' && stack[j] <= 'Z')) {
                    break;
                }
            }
            if (j < stack_top) {
                braces[brace_pair_cnt].type = string;
                printf("to match string!\n");
            } else {
                braces[brace_pair_cnt].type = integer;
                printf("to match integer!\n");
            }

            brace_pair_cnt++;
            in_group = 0;
            stack_top = 0;
            continue;
        }

        if (in_group) {
            stack[stack_top++] = c;
        }
    }

    if (brace_pair_cnt > 0) {
        ret = 0;
        *found = brace_pair_cnt;
        *out = braces;
        braces = NULL;      /* ownership moved to the caller */
    }

err:
    free(braces);
    free(stack);
    return ret;
}
/*
 * Free the strings held by the first `filled` cells of one match row.
 * Used to undo a row that could not be completed.
 */
static void discard_partial_row(s_mbc *row, size_t filled)
{
    size_t k;

    for (k = 0; k < filled; k++) {
        /* every cell that is not an integer owns a heap string */
        if (row[k].type != integer) {
            free(row[k].data.string);
            row[k].data.string = NULL;
        }
    }
}

/*
 * Find every non-overlapping match of `pattern` (POSIX extended syntax) in
 * `buf` and store the captured groups as a row-major table of s_mbc cells:
 * one row per match, one cell per group.
 *
 * Parameters:
 *   buf        subject string to scan
 *   pattern    regular expression; init_brace_mode() decides whether each
 *              group is stored as an integer or as a string
 *   match      receives the table, or NULL when no cell was stored;
 *              release it with regex_free_all()
 *   cell_num   receives the number of groups per row
 *   match_num  receives the number of matches found
 *
 * Returns:
 *   0 when the scan completed and found at least one match; -1 on a NULL
 *   argument, a compile error, a group-count mismatch, no match at all, a
 *   regexec error, or memory exhaustion. Rows completed before a failure
 *   are still handed back through match / match_num.
 *
 * A group that did not take part in a match is stored as 0 or as "".
 */
int regex_match_all(char *buf, char* pattern, void** match, int *cell_num, int *match_num) {
    regex_t re;
    regmatch_t *subs = NULL;     /* whole match + one slot per group */
    char errbuf[EBUFLEN];
    s_mbc *tamplate = NULL;      /* per-group type template */
    s_mbc *table = NULL;         /* result rows */
    const char *cursor;
    size_t group_cnt, g;
    int brace_pair_num = 0;
    int find = 0;
    int ret = -1;
    int err;

    if (buf == NULL || pattern == NULL || match == NULL ||
        cell_num == NULL || match_num == NULL) {
        printf("null input pointer\n");
        return -1;
    }
    /* Give the caller well-defined outputs on every path. */
    *match = NULL;
    *cell_num = 0;
    *match_num = 0;

    err = reg_comp(&re, pattern, REG_EXTENDED);
    if (err) {
        reg_error(err, &re, errbuf, sizeof(errbuf));
        printf("error: regcomp: %s\n", errbuf);
        return -1;
    }
    group_cnt = re.re_nsub;

    init_brace_mode(pattern, &tamplate, &brace_pair_num);
    if (group_cnt != (size_t)brace_pair_num || (group_cnt > 0 && tamplate == NULL)) {
        printf("please check function init_brace_mode()!\n");
        goto done;
    }

    /* Sized from the compiled pattern, so any number of groups fits. */
    subs = malloc((group_cnt + 1) * sizeof(*subs));
    if (subs == NULL) {
        printf("[%20s:%d]no spare memory!\n", __func__, __LINE__);
        goto done;
    }

    cursor = buf;
    for (;;) {
        s_mbc *row = NULL;
        /* Past the first match the cursor is no longer at the start of
         * the subject, so '^' must not match there. */
        int eflags = (cursor == buf) ? 0 : REG_NOTBOL;

        err = reg_exec(&re, cursor, group_cnt + 1, subs, eflags);
        if (err == REG_NOMATCH) {
            if (find == 0) {
                printf("Sorry, no match ...\n");
            } else {
                ret = 0;    /* normal end of scan */
            }
            break;
        }
        if (err) {
            reg_error(err, &re, errbuf, sizeof(errbuf));
            printf("error: regexec: %s\n", errbuf);
            break;
        }
        printf("\nOK, has matched ...\n\n");

        if (group_cnt > 0) {
            s_mbc *grown = realloc(table, ((size_t)find + 1) * group_cnt * sizeof(*grown));
            if (grown == NULL) {
                printf("[%20s:%d]no spare memory!\n", __func__, __LINE__);
                break;
            }
            table = grown;
            row = table + (size_t)find * group_cnt;
        }

        printf("begin: %d, len = %zu ", (int)subs[0].rm_so,
               (size_t)(subs[0].rm_eo - subs[0].rm_so));

        for (g = 1; g <= group_cnt; g++) {
            const char *start = cursor;
            size_t len = 0;
            char *text;

            /* rm_so is -1 for a group that did not participate */
            if (subs[g].rm_so >= 0) {
                start = cursor + subs[g].rm_so;
                len = (size_t)(subs[g].rm_eo - subs[g].rm_so);
            }

            /* Exact-size copy: a capture may be arbitrarily long. */
            text = malloc(len + 1);
            if (text == NULL) {
                printf("[%20s:%d]no spare memory!\n", __func__, __LINE__);
                discard_partial_row(row, g - 1);
                goto done;
            }
            memcpy(text, start, len);
            text[len] = '\0';

            printf("subexpression %d begin: %d, len = %zu ", (int)g, (int)subs[g].rm_so, len);
            printf("match: %s\n", text);

            /* group g of the pattern fills cell g - 1 of the row */
            row[g - 1].type = tamplate[g - 1].type;
            if (row[g - 1].type == integer) {
                row[g - 1].data.integer = atoi(text);
                free(text);
            } else {
                row[g - 1].data.string = text;   /* the cell now owns it */
            }
        }
        find++;

        /* Resume after the whole match. An empty match consumes nothing,
         * so step one byte to guarantee progress. */
        cursor += subs[0].rm_eo;
        if (subs[0].rm_eo == subs[0].rm_so) {
            if (*cursor == '\0') {
                ret = 0;
                break;
            }
            cursor++;
        }
    }

done:
    free(subs);
    free(tamplate);
    reg_free(&re);
    *match = table;
    *match_num = find;
    *cell_num = brace_pair_num;
    return ret;
}
/*
 * Release a result table produced by regex_match_all().
 *
 * Parameters:
 *   match      table of match_num rows of cell_num cells; may be NULL
 *   cell_num   number of cells per row
 *   match_num  number of rows
 *
 * Frees the string of every cell tagged `string`, then the table itself.
 * A NULL table is accepted and nothing is done.
 */
void regex_free_all(void* match, int cell_num, int match_num)
{
    /* Typed pointer: arithmetic on void * is a compiler extension. */
    s_mbc *table = match;
    int r, c;

    if (table == NULL) {
        return;
    }

    if (cell_num > 0) {
        for (r = 0; r < match_num; r++) {
            s_mbc *row = table + (size_t)r * (size_t)cell_num;

            for (c = 0; c < cell_num; c++) {
                if (row[c].type == string) {
                    free(row[c].data.string);
                }
            }
        }
    }
    free(table);
}
/*
 * Demo: pull the MAC address, timeout and freetime of every record out of
 * a sample message and print each captured cell.
 */
int main()
{
    char *in = "QueryTimesResult,MAC=00:22:33:44:55:77,timeout=1,freetime=10;00:22:33:44:55:88,timeout=0,freetime=20";
    char p[] = "\\s*([A-F0-9:]+)\\s*,\\s*timeout\\s*=\\s*([0-9]+)\\s*,\\s*freetime\\s*=\\s*([0-9]+)";
    void *match = NULL;
    int cell_num = 0;
    int match_num = 0;
    int row, col;

    /* MALLOC_TRACE names the file that mtrace() logs allocations to. */
    setenv("MALLOC_TRACE", "output", 1);
    mtrace();

    regex_match_all(in, p, &match, &cell_num, &match_num);

    /* Walk the table row by row; a non-positive match_num skips the loop. */
    for (row = 0; row < match_num; row++) {
        s_mbc *cells = (s_mbc *)match + cell_num * row;

        for (col = 0; col < cell_num; col++) {
            switch (cells[col].type) {
            case string:
                printf("----- string: %s\n", cells[col].data.string);
                break;
            default:
                printf("----- integer: %d\n", cells[col].data.integer);
                break;
            }
        }
    }

    regex_free_all(match, cell_num, match_num);
    return 0;
}
</span>
打印结果(仅列出main函数输出的行,regex_match_all和init_brace_mode打印的调试信息从略):
----- string: 00:22:33:44:55:77
----- integer: 1
----- integer: 10
----- string: 00:22:33:44:55:88
----- integer: 0
----- integer: 20