/*
 * =====================================================================================
 *
 *       Filename:  regex_test.c
 *
 *    Description:  Extract named entities according to rule templates
 *
 *        Version:  1.0
 *        Created:  2011-04-06 10:49:16
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Qi Baoyuan (齐保元, qby), qibaoyuan@126.com
 *        Company:  ict,gucas
 *
 * =====================================================================================
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>

/* One extraction rule read from the template file. */
typedef struct _RULE_{
    char name[256]; /* rule pattern, e.g. a+n+b+ */
    double freq;    /* probability that a match of the rule is a named entity
                       (figures supplied by Wang Shi) */
}RULE;

/* Holds every compiled regular-expression object. */
static regex_t regexes[256];

/* Current sentence, e.g. {{"蓝屏","司机"},{"a","n"}}: input[0][k] is the k-th
 * word and input[1][k] its part-of-speech tag (same index in both rows). */
static char *input[2][500];

/* forward declarations */
FILE *file_open(char* file_name,char* mode);
char* read_line (FILE *fp);
char* parse_line(char* line,int index,char sign);
void process_regex(int count,char*line);
static void close_all_regex(int count);

/* Free the first n word/tag pairs of the current sentence and clear their
 * slots, so that a later, shorter sentence never sees stale entries. */
static void reset_sentence(size_t n)
{
    for (size_t k = 0; k < n; k++) {
        free(input[0][k]);
        input[0][k] = NULL;
        free(input[1][k]);
        input[1][k] = NULL;
    }
}

/* Concatenate the POS tags of the first n tokens into one string, run every
 * rule over it, then release the sentence.
 * NOTE(review): process_regex maps match offsets in this string straight to
 * word indices, which presumably requires one-byte tags -- confirm. */
static void flush_sentence(int count, size_t n)
{
    char pos_seq[2048];
    size_t used = 0;

    pos_seq[0] = '\0';
    for (size_t k = 0; k < n; k++) {
        const char *tag = input[1][k];
        size_t len;

        /* As before, an empty tag ends the tag sequence. */
        if (tag == NULL || tag[0] == '\0')
            break;
        len = strlen(tag);
        if (len >= sizeof pos_seq - used) {
            fprintf(stderr, "POS sequence too long; truncated after %zu tokens\n", k);
            break;
        }
        memcpy(pos_seq + used, tag, len + 1); /* copies the terminator too */
        used += len;
    }
    process_regex(count, pos_seq);
    reset_sentence(n);
}

/*-----------------------------------------------------------------------------
 *  Read the corpus file "huge.crf_format.txt_0" line by line. Each non-empty
 *  line contributes one token: tab-separated field 0 is the word and field 1
 *  its POS tag. An empty line ends the sentence, at which point all rules
 *  are applied to it through process_regex. A final sentence that is not
 *  followed by an empty line is processed as well.
 *
 *  count: number of compiled expressions in regexes[]; they are released
 *         with close_all_regex before returning, also when the corpus
 *         cannot be opened.
 *-----------------------------------------------------------------------------*/
void get_corpse(int count){
    const size_t max_tokens = sizeof input[0] / sizeof input[0][0];
    FILE *fp = file_open("huge.crf_format.txt_0","r");
    char *line;
    size_t n = 0;                 /* tokens collected for the current sentence */
    int overflow_reported = 0;

    if (fp == NULL) {             /* file_open has already reported the error */
        close_all_regex(count);
        return;
    }

    while ((line = read_line(fp)) != NULL) {
        if (line[0] == '\0') {    /* blank line: sentence boundary */
            free(line);
            if (n > 0)
                flush_sentence(count, n);
            n = 0;
            overflow_reported = 0;
            continue;
        }

        if (n < max_tokens) {
            char *word = parse_line(line, 0, '\t');
            char *pos = parse_line(line, 1, '\t');

            if (word != NULL && pos != NULL) {
                input[0][n] = word;
                input[1][n] = pos;
                n++;
            } else {              /* allocation failure: drop this token */
                free(word);
                free(pos);
            }
        } else if (!overflow_reported) {
            fprintf(stderr, "sentence longer than %zu tokens; extra tokens ignored\n",
                    max_tokens);
            overflow_reported = 1;
        }
        free(line);               /* parse_line returned independent copies */
    }

    if (n > 0)                    /* corpus did not end with a blank line */
        flush_sentence(count, n);

    fclose(fp);
    close_all_regex(count);
}

/*-----------------------------------------------------------------------------
 *  Parse one line of the rule template into a RULE.
 *
 *  The pattern is the text from the first non-space character up to the
 *  first tab (or the end of the line). A backslash is inserted in front of
 *  every '+' and '*'. If the text after the tab starts with a number, it is
 *  stored in freq; otherwise freq is 0.
 *
 *  NOTE(review): the rules are compiled with cflags == 0 (basic regex), where
 *  glibc reads "\+" as "one or more" but "\*" as a literal asterisk --
 *  confirm that escaping '*' is intended.
 *
 *  line: template line; a NULL pointer terminates the program with status 1.
 *  Returns the rule by value. A pattern that does not fit into name[] is
 *  truncated at a whole-character boundary and a warning is printed.
 *-----------------------------------------------------------------------------*/
RULE parse_rule(char* line){
    RULE rule;
    size_t used = 0;
    const size_t cap = sizeof rule.name - 1;  /* keep room for the terminator */
    const char *p;
    const char *end;

    if(line==NULL){
        fprintf(stderr,"空串\n");
        exit(1);
    }
    memset(&rule, 0, sizeof rule);            /* zeroes name and freq */

    p = line;
    while (*p == ' ')
        p++;
    end = p + strcspn(p, "\t");               /* tab or terminating NUL */

    for (; p < end; p++) {
        int escape = (*p == '+' || *p == '*');

        /* Never store a backslash without the character it escapes. */
        if (used + (escape ? 2u : 1u) > cap) {
            fprintf(stderr, "rule pattern too long; truncated to '%s'\n", rule.name);
            break;
        }
        if (escape)
            rule.name[used++] = '\\';
        rule.name[used++] = *p;
    }

    if (*end == '\t') {
        char *num_end;
        double value = strtod(end + 1, &num_end);

        if (num_end != end + 1)               /* at least one digit consumed */
            rule.freq = value;
    }
    return rule;
}
/*----------------------------------------------------------------------------- * 对所有的正则表达式进行编译,保存到结构提regexes,返回加载的正则表达式的个数 *-----------------------------------------------------------------------------*/ static int compile_all_regx(){ int p=0,z=0,cflags=0; char ebuf[300]; regex_t reg; char *pattern; FILE *fp=file_open("template","r"); char *line; RULE rule; while((line=read_line(fp))!=NULL){ if(strlen(line)==0)continue; printf("第%d行/n",p); /* a 0.1 b 0.2 */ rule=parse_rule(line); pattern=rule.name; printf("加载第%d个正则表达式:%s,freq:%f/n",p,pattern,rule.freq); z=regcomp(®,pattern,cflags); if(0!=z){ regerror(z,®,ebuf,sizeof(ebuf)); fprintf(stderr,"%s:pattern '%s'/n",ebuf,pattern); return; } regexes[p]=reg; p++; free(line); } return p; } /*----------------------------------------------------------------------------- * 关闭所有打开的正则表达式 *-----------------------------------------------------------------------------*/ static void close_all_regex(int count){ int i=0; for(i=0;i<count;i++){ regfree(®exes[i]); } } /*----------------------------------------------------------------------------- * 打开文件,返回句柄 *-----------------------------------------------------------------------------*/ FILE *file_open(char* file_name,char* mode){ FILE *fp; if((fp=fopen(file_name,mode))==NULL){ fprintf(stderr,"无法打开文件:%s./n",file_name); return NULL; } return fp; } /*----------------------------------------------------------------------------- * 返回一行 *-----------------------------------------------------------------------------*/ char* read_line (FILE *fp) { if(feof(fp)) return NULL; char ch; int max_len=100; char *line=(char*)malloc(max_len); char *new_line; int counter=0;//counter while( ((ch=fgetc(fp))!='/n') && !feof(fp)){ if(counter>=max_len-1){ max_len*=2; new_line=(char*)malloc(max_len); strcpy(new_line,line); free(line); line=new_line; } *(line+(counter++))=ch; } *(line+counter)='/0'; return line; } /*----------------------------------------------------------------------------- * 解析,line,要取的下标,分割符 
*-----------------------------------------------------------------------------*/ char* parse_line(char* line,int index,char sign){ char *p=line; char *q=line; while(*p==' ')p++; while(*q==' ')q++; int i=0; char *ret=(char*)calloc(sizeof(char),300); while(1){ if(*q!=sign){q++;continue;} if(i<index){ i++; p=++q; continue; }else{ strncpy(ret,p,q-p); p=++q; break; } if(*q=='/0')break; } return ret; } void process_regex(int count,char *lbuf){ int p=0; char *pattern; int x,z,no=0,cflags=0; char ebuf[1280]={'/0'}; char ret[1024]; char *curr; regmatch_t pm[10]; const size_t nmatch=10; char word[200]; int word_i; int ori_i; //对输入在每个正则表达式进行匹配,选取最长的 for(p=0;p<count;p++){ ++no; a if((z=strlen(lbuf))>=0 && lbuf[z-1]=='/n') lbuf[z-1]=0; curr=lbuf; ori_i=0; while(regexec(®exes[p],curr,nmatch,pm,0)==0){ if(pm[0].rm_so==-1) break; memset(ret,0x00,sizeof(ret)); memset(word,0x00,sizeof(word)); memcpy(ret,curr+pm[0].rm_so,pm[0].rm_eo-pm[0].rm_so); for(word_i=pm[0].rm_so+ori_i;word_i<pm[0].rm_eo+ori_i;word_i++){ sprintf(word,"%s%s",word,input[0][word_i]); } printf("pos:%s->word:%s/n/n",ret,word); curr+=pm[0].rm_eo; ori_i+=pm[0].rm_eo; } } } int main(int argc,char** argv){ int count=compile_all_regx();//预编译regex get_corpse(count); return EXIT_SUCCESS; }