开始有点明白 PCRE 的强大之处了,正则表达式确实很厉害。下面的程序对文本行进行断句,以标点符号(以及空格、制表符)作为断句的分隔符,代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pcre.h"
#include <unistd.h>
#include <iostream>
#include <string>
#include <vector>
#include "ul_ccode.h"
#include "ul_conf.h"
#include "ul_log.h"
#define N 10240
#define M 30
int main(int argc, char *argv[])
{
if(argc < 3)
{
printf("Usage : %s infile outfile\n", argv[0]);
exit(-1);
}
FILE *fp_in;
FILE *fp_out;
fp_in = fopen(argv[1], "rb");
fp_out = fopen(argv[2], "wb");
if(NULL == fp_in || NULL == fp_out)
{
printf("FILE open failure\n");
exit(-1);
}
const char *error;
pcre *re;
int erroffset;
int ovector[M];
int rc = 0, i = 0;
char buffer[N];
memset(buffer, 0, N);
// pattern
//char pattern[N] = "(0|1|2|3|4|5|6|7|8|9){5,}";
char pattern[N] = "(,|。| |\t|“|”|;){1,}";
printf("%s\n", pattern);
ul_trans2bj(pattern, buffer);
ul_trans2lower(buffer, pattern);
printf("%s\n", pattern);
re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
if(NULL == re)
{
printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
exit(-1);
}
char line[N], line1[N], line2[N], line_out[N];
char *p1 = NULL, *p2 = NULL;
int len = 0;
int num = 0;
std::string str;
std::vector<std::string> svec;
std::vector<std::string>::iterator iter;
while(fgets(line, N, fp_in))
{
//strncpy(line1, line, N);
len = strlen(line);
while((line[len - 1] == '\n' || line[len - 1] == ' ' || line[len - 1] == '\t') && len > 0)
-- len;
line[len] = '\0';
printf("%s\n", line);
ul_trans2bj(line, line1);
//printf("%s\n", line1);
ul_trans2lower(line1, line2);
//printf("%s\n", line2);
p2 = &line2[0];
svec.clear();
while(p2 != NULL)
{
p1 = p2;
rc = pcre_exec(re, NULL, p2, strlen(p2), 0, 0, ovector, M);
if(rc < 0)
{
//printf("NO match...\n");
//fprintf(fp_out, "%s\n", line1);
//printf("%s\n", p2);
strncpy(line_out, p2, strlen(p2));
line_out[strlen(p2)] = '\0';
str = std::string(line_out);
svec.push_back(str);
p2 = NULL;
continue;
}
len = strlen(p2);
if(ovector[0] > 0)
{
strncpy(line_out, p2, ovector[0]);
line_out[ovector[0]] = '\0';
str = std::string(line_out);
svec.push_back(str);
}
if(ovector[1] < len)
{
p2 = p1 + ovector[1];
//strncpy(line_out, p2, len - ovector[1]);
//line_out[len - ovector[1]] = '\0';
//str = std::string(line_out);
//svec.push_back(str);
}else{
p2 = NULL;
continue;
}
}
if(svec.size() > 0)
{
for(iter = svec.begin(); iter != svec.end(); ++ iter)
printf("%s\n", (*iter).c_str());
}
//strncpy(line1, line, N);
//if(0 == len)
// continue;
//line[len] = '\0';
//strncpy(line1, line, N);
fprintf(fp_out, "%s\n", line2);
}
fclose(fp_in);
fclose(fp_out);
pcre_free(re);
return 0;
}
其中用到的两个函数的含义如下:
ul_trans2bj(pattern, buffer);
//将 pattern 中的全角字符转换为半角字符,结果写入 buffer
ul_trans2lower(buffer, pattern);
//将 buffer 中的字符串转换为小写,结果写入 pattern