#include<dirent.h>
#include<errno.h>
#include<malloc.h>
#include<stdio.h>
#include<algorithm>
#include<cstring>
#include<fstream>
#include<iostream>
#include<string>
#include<vector>
using namespace std;
// Pattern-length bound for the string matcher (number of table entries).
#define MAXN 500
// Equality test for two values; arguments are parenthesised before comparing.
#define _match(a,b) ((a)==(b))
// Size in bytes of the Attribute::name buffer.
#define N_NAME 200
// Size in bytes of the Attribute::prefix / start / suffix buffers.
#define N_FIX 500
// Input buffer size in bytes; in the visible source it is only referenced
// from the commented-out main().
#define buffersize 10000000
// Description of one attribute to extract: a label plus the three marker
// strings that are searched for, in order, in the source text.
struct Attribute
{
    char name[N_NAME];    // label of the attribute
    char prefix[N_FIX];   // marker located first
    char start[N_FIX];    // marker located after the prefix; the value begins right after it
    char suffix[N_FIX];   // marker located after the start; the value ends right before it
};

// The single attribute description shared by the functions in this file.
Attribute att;
// One extracted occurrence, expressed as offsets into the scanned text.
struct att_text
{
    int att_id;   // identifier of the attribute this occurrence belongs to
    int s;        // offset of the first character of the value
    int e;        // offset one past the last character of the value
};

// Fixed-capacity table of extracted occurrences (100 slots).
att_text ex_text[100];
/******************************************************************************/
/*
String pattern matching (Knuth-Morris-Pratt).

Parameters:
  ls  : number of characters of str to search
  str : text to search in
  lp  : number of characters of pat
  pat : pattern to look for
Returns:
  Offset in str of the first occurrence of pat, or -1 when there is none.
  An empty pattern (lp == 0) matches at offset 0; a negative lp never matches.

The failure table is sized from lp, so patterns of any length are handled
without overrunning a fixed-size stack buffer.
*/
int pat_match(int ls,char str[],int lp,char pat[]){
    if (lp < 0)
        return -1;
    if (lp == 0)
        return 0;

    // fail[j] is the index of the last character of the longest proper
    // prefix of pat[0..j] that is also a suffix of it, or -1 if none exists.
    std::vector<int> fail(lp, -1);
    for (int j = 1; j < lp; j++)
    {
        int i = fail[j - 1];
        while (i >= 0 && pat[i + 1] != pat[j])
            i = fail[i];
        fail[j] = (pat[i + 1] == pat[j]) ? i + 1 : -1;
    }

    int i = 0;
    int j = 0;
    for (; i < ls && j < lp; i++)
    {
        if (str[i] == pat[j])
        {
            j++;
        }
        else if (j)
        {
            // Fall back in the pattern and re-examine the same text character.
            j = fail[j - 1] + 1;
            i--;
        }
    }
    return j == lp ? (i - lp) : -1;
}
/********************************************************************************/
/*
Multi-stage scan: attribute extraction.

Repeatedly locates att.prefix, then att.start, then att.suffix in org_text,
each search beginning where the previous one ended. For every complete
prefix/start/suffix sequence, one entry is stored in ex_text: s is the offset
just after the start marker, e is the offset of the suffix marker. After a
hit, scanning resumes one character past the beginning of the suffix.

Parameters:
  org_text : NUL-terminated text to scan
Returns:
  Number of entries written to ex_text. Scanning stops once ex_text is full,
  so the result never exceeds the capacity of that table.
*/
int multi_ex(char org_text[])
{
    const int capacity = static_cast<int>(sizeof(ex_text) / sizeof(ex_text[0]));
    const int text_len = static_cast<int>(std::strlen(org_text));
    const int prefix_len = static_cast<int>(std::strlen(att.prefix));
    const int start_len = static_cast<int>(std::strlen(att.start));
    const int suffix_len = static_cast<int>(std::strlen(att.suffix));

    int remaining = text_len;   // characters of org_text not yet consumed
    int ex_num = 0;             // number of occurrences found so far

    // remaining can drop to -1 after a hit at the very end of the text when
    // the suffix is empty; stopping there guarantees termination even when
    // every marker is empty.
    while (remaining >= 0 && ex_num < capacity)
    {
        int x = pat_match(remaining, org_text + (text_len - remaining), prefix_len, att.prefix);
        if (x == -1)
            break;
        remaining -= x + prefix_len;

        // Each search covers all of the remaining text, so a marker that is
        // missing now cannot be found after skipping further ahead: stop
        // instead of rescanning (which never terminates for an empty prefix).
        x = pat_match(remaining, org_text + (text_len - remaining), start_len, att.start);
        if (x == -1)
            break;
        const int value_begin = text_len - remaining + x + start_len;
        remaining -= x + start_len;

        x = pat_match(remaining, org_text + (text_len - remaining), suffix_len, att.suffix);
        if (x == -1)
            break;

        ex_text[ex_num].s = value_begin;
        ex_text[ex_num].e = text_len - remaining + x;
        ex_text[ex_num].att_id = 0;   // only one attribute is handled, so the id is always 0
        ex_num++;
        remaining -= x + 1;
    }
    return ex_num;
}
/*********************************************************************************/
/*
参数 :
org_text:待抽取的文本内容
name :抽取的属性名
prefix :抽取串开始的位置
start :抽取的开始位置
suffix :抽取的结束位置
返回值:
result :抽取的结果,每条记录以'\n'分割
*/
string extract(char org_text[], char name[], char prefix[] ,char start[], char suffix[])
{
//获得待提取的属性信息
strcpy(att.name,name);
strcpy(att.prefix,prefix);
strcpy(att.start,start);
strcpy(att.suffix,suffix);
int text_len = strlen(org_text);
int result_num = multi_ex(org_text);
string result;
//输出提取结果
int i=0,j;
for(i=0;i<result_num;i++)
{
for(j=ex_text[i].s;j<ex_text[i].e;j++)
{
//fprintf(stdout,"%c",org_text[j]);
result+=org_text[j];
}
//fprintf(stdout,"\n");
result+='\n';
}
return result;
}
/*
int main()
{
char name[100] = "title";
char prefix[100] = "<title>";
char start[100] = "";
char suffix[100] = "</title>";
char org_text[buffersize];
memset(org_text,0,sizeof(org_text));
FILE *fp;
fp=freopen("1.html","r",stdin);
if(fp==NULL)
perror("failed fopen");
fread(org_text,sizeof(char),buffersize,fp);
fclose(fp);
string result;
result = extract(org_text,name, prefix, start, suffix);
cout<<result;
return 0;
}
*/
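// Text extraction function: easier to use than regular expressions.
// (Scraped page footer: "latest recommended article" published 2020-12-14 20:31:55.)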