把普通语料整理成微软格式的CRF语料

 

//把普通语料整理成微软格式的CRF语料

 

#include <stdio.h>

#include <ctype.h>

#include <string.h>

#include <iostream>

#include <string>

#include <vector>

 

// Size in bytes of the buffer used to read one line of the input corpus.
// Parenthesized so the macro stays correct inside larger expressions.
#define MAXLINELEN (1024*5)

// Space-separated list of characters treated as numerals (Chinese numerals,
// their formal variants, and ASCII digits). Looked up with strstr().
#define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟 1 2 3 4 5 6 7 8 9 0")

// Space-separated list of CJK / typographic punctuation marks. Looked up with strstr().
#define PUNCTION ("。 , 、 ; : ? ! “ ” ‘  ’╗ ╚ ┐ └ (  ) … … — — —  《  》  〈  〉 · .")

// Space-separated list of ASCII punctuation marks. Looked up with strstr().
// The double quote has to be escaped as \" or it terminates the literal early.
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

 

using namespace std;

 

// Reads in_file line by line, converts every line into tagged feature rows
// and writes them to out_file. Returns 1 when done, or -1 on an open failure
// (e.g. in_file cannot be opened).
int trans_file(const char *in_file,const char *out_file);

// Truncates srcline at its first CR (0x0D) or LF (0x0A) and returns the
// length of the remaining string.
int  chomp(char *srcline);

// Walks the space-separated words of line and appends one "unit/TAG " entry
// per unit of each word to dst_line (tags: S, B, B2, B3, M, E).
void from_seg_to_tag(const char *line,vector<string> &dst_line);

// Turns every "unit/TAG " entry of v_dst into a row
// "unit y_punc|n_punc y_num|n_num TAG" and appends it to v_data.
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);

 

/*
 * Program entry point.
 *
 * Usage: <program> in_file out_file
 *
 * Returns 0 when the conversion ran, and 1 when the argument count is wrong
 * or trans_file() reports a failure (negative return value).
 */
int main(int argc,char **argv)
{
    if (argc != 3)
    {
        printf("usage:%s in_file out_file\n", argv[0]);
        return(1);
    }

    // trans_file() signals failure with a negative value; turn that into a
    // non-zero exit status so shells and scripts can detect it.
    if (trans_file(argv[1],argv[2]) < 0)
        return(1);

    return(0);
}

 

/*
 * Cuts the string at its first carriage return or line feed.
 *
 * srcline: NUL-terminated buffer, modified in place.
 * Returns the length of the string after the cut.
 */
int chomp(char *srcline)
{
    char *cursor = srcline;

    // Advance until the terminator or the first end-of-line byte.
    while(*cursor != '\0' && *cursor != '\r' && *cursor != '\n')
        ++cursor;

    *cursor = '\0';
    return(static_cast<int>(cursor - srcline));
}

 

/*
 * Checks that a feature row consists of exactly four space-separated columns.
 *
 * Columns are counted as maximal runs of non-space characters, so a row that
 * starts or ends with a space (i.e. has an empty first or last column) is
 * rejected instead of being mistaken for a complete row.
 *
 * str: the row to check.
 * Returns true when the row has exactly four non-empty columns.
 */
bool ValidColumn(const std::string &str)
{
    const size_t kExpectedColumns = 4;

    size_t columns = 0;
    bool inside_column = false;
    for(size_t n = 0;n < str.size();n++)
    {
        if(str[n] == ' ')
        {
            inside_column = false;
        }
        else if(!inside_column)
        {
            // first character of a new column
            inside_column = true;
            columns++;
        }
    }
    return(columns == kExpectedColumns);
}

 

int trans_file(const char *in_file,const char *out_file)

{

FILE *fin,*fout;

char line[MAXLINELEN];

 

fin = fopen(in_file,"rb");

if(NULL == fin)

{

printf("can't open %s/n",in_file);

return(-1);

}

fout = fopen(out_file,"wb");

while(!feof(fin))

{

fgets(line,MAXLINELEN,fin);

if(chomp(line) < 2)

continue;

vector<string> v_dst,v_data;

from_seg_to_tag(line,v_dst);

from_tag_to_data(v_dst,v_data);

 

for(size_t n = 0;n < v_data.size();n++)

{

if(ValidColumn(v_data[n].c_str()) == true)

fprintf(fout,"%s/n",v_data[n].c_str());

else

printf("column size error =%s/n",v_data[n].c_str());

}

fprintf(fout,"/n");

}

fclose(fin);

fclose(fout);

return(1);

}

 

/*
 * Tells whether a unit should get the "number" feature.
 *
 * A unit counts as a number when it starts with an ASCII digit (this covers
 * purely numeric tokens such as "2008") or when it occurs in the C_NUMBER
 * list of numerals. An empty unit is never a number.
 *
 * word_cur: the unit to classify.
 */
bool IsNumber(const std::string &word_cur)
{
    // strstr() matches the empty string everywhere, so reject it up front.
    if(word_cur.empty())
        return(false);

    // isdigit() returns an arbitrary non-zero value for digits (not
    // necessarily 1) and requires an argument representable as unsigned char;
    // multi-byte characters have the high bit set and would be negative as
    // plain char.
    if(isdigit(static_cast<unsigned char>(word_cur[0])) != 0)
        return(true);

    if(strstr(C_NUMBER,word_cur.c_str()) != NULL)
        return(true);

    return(false);
}

 

/*
 * Tells whether a unit should get the "punctuation" feature, i.e. whether it
 * occurs in the PUNCTION or E_PUNCTION lists.
 *
 * word_cur: the unit to classify. An empty unit is never punctuation
 *           (strstr() would otherwise match it against any list).
 */
bool IsPunc(const std::string &word_cur)
{
    if(word_cur.empty())
        return(false);

    const char *needle = word_cur.c_str();
    return(strstr(PUNCTION,needle) != NULL ||
           strstr(E_PUNCTION,needle) != NULL);
}

/*
 * Splits an entry of the form "unit/TAG " into its unit and its tag.
 *
 * The split happens at the LAST '/', because the unit itself may contain
 * slashes (e.g. "2008/01/S " -> unit "2008/01", tag "S") while the tag is
 * appended by this program and never contains one. Spaces are removed from
 * the tag.
 *
 * word: receives the unit (the whole of src when no '/' is present).
 * tgt:  receives the tag (empty when no '/' is present).
 * src:  the entry to split.
 *
 * When src contains no '/', an error line is printed to stdout.
 */
void split_word_tgt(std::string &word,std::string &tgt,const std::string &src)
{
    word = "";
    tgt = "";

    const std::string::size_type slash = src.rfind('/');
    if(slash == std::string::npos)
    {
        word = src;
        printf("error in split %s\n",src.c_str());
        return;
    }

    word.assign(src,0,slash);
    for(std::string::size_type n = slash + 1;n < src.length();n++)
    {
        if(src[n] != ' ')
            tgt += src[n];
    }
}

 

/*
 * Attaches feature information to every tagged unit.
 *
 * For each "unit/TAG " entry in v_dst one row is appended to v_data:
 *     unit <y_punc|n_punc> <y_num|n_num> TAG
 *
 * v_dst:  entries to convert (only read).
 * v_data: receives the generated rows.
 */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data)
{
    const size_t total = v_dst.size();

    for(size_t idx = 0;idx < total;idx++)
    {
        string unit,label;
        split_word_tgt(unit,label,v_dst[idx]);

        string row = unit;
        row += " ";
        // punctuation feature
        row += IsPunc(unit) ? "y_punc" : "n_punc";
        row += " ";
        // number feature
        row += IsNumber(unit) ? "y_num" : "n_num";
        row += " ";
        row += label;

        v_data.push_back(row);
    }
}

 

/*
 * Splits one space-delimited word into units.
 *
 * Starting at line, consumes bytes up to the next space or the end of the
 * string. A run of consecutive single-byte (below 0x80) characters becomes
 * one unit; every byte with the high bit set is taken together with the byte
 * that follows it as one two-byte unit.
 * NOTE(review): the fixed two-byte step presumably targets GBK/GB2312 input;
 * UTF-8 text would not be split on character boundaries - confirm the corpus
 * encoding.
 *
 * line:  start of the word inside a NUL-terminated buffer.
 * array: receives the units in order.
 * Returns a pointer to the first byte that was not consumed (a space or the
 * terminating NUL).
 */
const char *split_char_str(const char *line,std::vector<std::string> &array)
{
    const char *pline = line;

    while(*pline && *pline != ' ')
    {
        // Compare as unsigned char so the result does not depend on whether
        // plain char is signed on the target platform.
        if(static_cast<unsigned char>(*pline) < 0x80)
        {
            std::string tmp;
            while(*pline && static_cast<unsigned char>(*pline) < 0x80 && *pline != ' ')
            {
                tmp += *pline;
                pline++;
            }
            if(!tmp.empty())
                array.push_back(tmp);
        }
        else
        {
            std::string tmp(1,*pline);
            pline++;
            // A lead byte may be the last byte of the buffer (e.g. a line cut
            // in the middle of a character). Only consume the second byte
            // when it exists, otherwise the cursor would step over the
            // terminator and read out of bounds.
            if(*pline)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
    }
    return(pline);
}

 

/*
 * Converts a space-segmented line into position-tagged units.
 *
 * Each word of the line is split into units with split_char_str(); for every
 * unit an entry "unit/TAG " is appended to v_dst. The tag encodes the
 * position of the unit inside its word:
 *     one unit        : S
 *     two units       : B E
 *     three units     : B B2 E
 *     four or more    : B B2 B3 M ... M E
 *
 * line:  NUL-terminated input line.
 * v_dst: receives the tagged entries in order.
 */
void from_seg_to_tag(const char *line,vector<string> &v_dst)
{
    const char *cursor = line;

    while(*cursor)
    {
        // skip the separators between words
        if(*cursor == ' ')
        {
            cursor++;
            continue;
        }

        vector<string> units;
        cursor = split_char_str(cursor,units);

        const size_t count = units.size();
        for(size_t i = 0;i < count;i++)
        {
            const char *tag;
            if(count == 1)
                tag = "/S ";
            else if(i == 0)
                tag = "/B ";
            else if(i == count - 1)
                tag = "/E ";        // the last unit wins over B2/B3
            else if(i == 1)
                tag = "/B2 ";
            else if(i == 2)
                tag = "/B3 ";
            else
                tag = "/M ";

            string entry = units[i];
            entry += tag;
            v_dst.push_back(entry);
        }
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值