linux C中使用正则表达式
对应的头文件是 #include <regex.h>
编译正则表达式 regcomp()
函数原型:
int regcomp(regex_t *preg, const char *regex, int cflags);
- regex_t 是一个结构体数据类型,用来存放编译后的正则表达式,其结构为
/* Compiled-pattern buffer: the structure that stands behind regex_t and
receives the compiled form of a regular expression.
Of the members shown here only `re_nsub' is declared under a plain name;
every other member name is wrapped in __REPB_PREFIX().
NOTE(review): __REPB_PREFIX and __RE_TRANSLATE_TYPE are defined outside
this excerpt -- presumably the former decorates member names to keep
them private to the library; confirm against the full <regex.h>. */
struct re_pattern_buffer
{
/* Space that holds the compiled pattern. It is declared as
`unsigned char *' because its elements are sometimes used as
array indexes. */
unsigned char *__REPB_PREFIX(buffer);
/* Number of bytes to which `buffer' points. */
unsigned long int __REPB_PREFIX(allocated);
/* Number of bytes actually used in `buffer'. */
unsigned long int __REPB_PREFIX(used);
/* Syntax setting with which the pattern was compiled. */
reg_syntax_t __REPB_PREFIX(syntax);
/* Pointer to a fastmap, if any, otherwise zero. re_search uses the
fastmap, if there is one, to skip over impossible starting points
for matches. */
char *__REPB_PREFIX(fastmap);
/* Either a translate table to apply to all characters before
comparing them, or zero for no translation. The translation is
applied to a pattern when it is compiled and to a string when it
is matched. */
__RE_TRANSLATE_TYPE __REPB_PREFIX(translate);
/* Number of subexpressions found by the compiler. */
size_t re_nsub;
/* Zero if this pattern cannot match the empty string, one else.
Well, in truth it's used only in `re_search_2', to see whether or
not we should use the fastmap, so we don't set this absolutely
perfectly; see `re_compile_fastmap' (the `duplicate' case). */
unsigned __REPB_PREFIX(can_be_null) : 1;
/* If REGS_UNALLOCATED, allocate space in the `regs' structure
for `max (RE_NREGS, re_nsub + 1)' groups.
If REGS_REALLOCATE, reallocate space if necessary.
If REGS_FIXED, use what's there. */
#ifdef __USE_GNU
# define REGS_UNALLOCATED 0
# define REGS_REALLOCATE 1
# define REGS_FIXED 2
#endif
unsigned __REPB_PREFIX(regs_allocated) : 2;
/* Set to zero when `regex_compile' compiles a pattern; set to one
by `re_compile_fastmap' if it updates the fastmap. */
unsigned __REPB_PREFIX(fastmap_accurate) : 1;
/* If set, `re_match_2' does not return information about
subexpressions. */
unsigned __REPB_PREFIX(no_sub) : 1;
/* If set, a beginning-of-line anchor doesn't match at the beginning
of the string. */
unsigned __REPB_PREFIX(not_bol) : 1;
/* Similarly for an end-of-line anchor. */
unsigned __REPB_PREFIX(not_eol) : 1;
/* If true, an anchor at a newline matches. */
unsigned __REPB_PREFIX(newline_anchor) : 1;
};
- preg 就是指向regex_t类型结构体的指针。用来存放编译后的正则匹配式。
- regex 是指向我们写好的正则表达式的指针。
- cflags 有如下几个值,用来调控正则表达式,可以使用0个或者多个
REG_EXTENDED :设置后使用扩展正则表达式
REG_ICASE:设置后不区分大小写
REG_NOSUB:设置后不报告匹配成功的位置(此时 regexec() 的 nmatch 和 pmatch 参数会被忽略)
REG_NEWLINE:设置后,匹配任意字符的操作符(如 .)不再匹配换行符;同时 ^ 可以匹配紧跟在换行符之后的位置,$ 可以匹配换行符之前的位置
匹配正则表达式 regexec()
regexec的原型
int regexec(const regex_t *preg, const char *string, size_t nmatch,regmatch_t pmatch[], int eflags);
- preg是编译后的正则表达式
- string是目标文本串
- nmatch是regmatch_t结构体数组的长度
- regmatch_t结构体数组用来存储匹配到的结果数据,其结构体原型如下:
typedef struct {
regoff_t rm_so;
regoff_t rm_eo;
} regmatch_t;
rm_so 表示匹配到的子串在目标文本串中的起始偏移量
rm_eo 表示匹配到的子串在目标文本串中的结束偏移量(即匹配子串最后一个字符之后的位置),因此匹配长度为 rm_eo - rm_so
- eflags有两个值
REG_NOTBOL:行首匹配符(^)始终匹配失败(但请参考上面编译标志 REG_NEWLINE 的说明)。
The match-beginning-of-line operator always fails to match (but see the compilation flag REG_NEWLINE above). This flag may be used when different portions of a string are passed to regexec() and the beginning of the string should not be interpreted as the beginning of the line.
REG_NOTEOL:行尾匹配符($)始终匹配失败(但请参考上面编译标志 REG_NEWLINE 的说明)。
释放正则表达式 regfree()
regfree()原型
void regfree(regex_t *preg);
当使用完编译好的正则表达式后,或者要重新编译其他正则表达式的时候可以用这个函数清空regex_t结构体里面的内容。
测试用例
以下用例读取一个文本文件,并从文本中匹配目标字符串
#include <stdio.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <regex.h>
/*
 * Read the file named by argv[1] into memory and print, one per line,
 * every "book/<1-6 digits>" path captured from an href="/book/..." attribute.
 *
 * Returns 0 on success and -1 on any failure (missing argument, open/read
 * error, out of memory, or a pattern that does not compile).
 */
int main(int argc, char **argv)
{
    /*
     * Capture group 1 is the "book/<id>" part. In an extended regular
     * expression a bare '(' opens a group, so no backslash is needed
     * (and "\(" is not a valid C escape sequence anyway).
     * NOTE(review): "\\s" is a GNU extension rather than POSIX ERE syntax.
     */
    const char *pattern = "href=\"\\s*/(book/[0-9]{1,6})\\s*";
    const size_t nmatch = 2;
    regmatch_t pmatch[2];
    regex_t reg;
    char errbuf[512];
    char *str = NULL;   /* heap buffer holding the whole file, NUL-terminated */
    size_t cap = 0;     /* bytes allocated for str */
    size_t count = 0;   /* bytes of file data stored in str */
    const char *p;
    ssize_t n;
    int fd;
    int t;

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "regex_demo");
        return -1;
    }

    fd = open(argv[1], O_RDONLY);
    if (fd < 0)
    {
        printf("file: %s open error\n", argv[1]);
        return -1;
    }

    /* Grow the buffer on demand so a file of any size fits; one byte is
       always kept in reserve for the terminating NUL. */
    for (;;)
    {
        if (count + 1 >= cap)
        {
            size_t new_cap = cap ? cap * 2 : 4096;
            char *grown = realloc(str, new_cap);
            if (grown == NULL)
            {
                fprintf(stderr, "out of memory\n");
                free(str);
                close(fd);
                return -1;
            }
            str = grown;
            cap = new_cap;
        }

        n = read(fd, str + count, cap - count - 1);
        if (n == 0)
        {
            break;
        }
        if (n < 0)
        {
            printf("file read error\n");
            free(str);
            close(fd);
            return -1;
        }
        count += (size_t)n;
    }
    close(fd);
    str[count] = '\0';

    printf("\nfile read over! begn URL analyse now...\n");

    if ((t = regcomp(&reg, pattern, REG_EXTENDED)) != 0)
    {
        regerror(t, &reg, errbuf, sizeof errbuf);
        fprintf(stderr, "grep: %s (%s)\n", errbuf, pattern);
        free(str);
        return -1;
    }
    /* Show the pattern that is in effect. */
    fprintf(stderr, "regex: %s\n", pattern);

    /* Stop on anything other than a successful match, so that an error
       code other than REG_NOMATCH cannot spin the loop forever. */
    p = str;
    while (regexec(&reg, p, nmatch, pmatch, 0) == 0)
    {
        int len = (int)(pmatch[1].rm_eo - pmatch[1].rm_so);

        /* Print the captured group straight from the buffer. */
        printf("%.*s\n", len, p + pmatch[1].rm_so);

        if (pmatch[0].rm_eo <= 0)
        {
            break;      /* an empty match would never advance p */
        }
        /* Resume the search right after the whole match. */
        p += pmatch[0].rm_eo;
    }

    regfree(&reg);
    free(str);
    return 0;
}
运行截图:
测试文本:
链接: https://pan.baidu.com/s/1qASJOJ-XyBdViElONmG5xw 提取码: tr4t