把普通语料整理成ME训练语料

 

//把语料整理成ME的格式

#include <stdio.h>

#include <ctype.h>

#include <string.h>

#include <iostream>

#include <string>

#include <vector>

#include <utility>

 

// Size in bytes of the buffer used to read one corpus line.
// Parenthesised so the macro expands safely inside larger expressions.
#define MAXLINELEN (1024*5)

// Space-separated list of tokens treated as numerals; searched with strstr().
#define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟 1 2 3 4 5 6 7 8 9 0")

// Space-separated list of full-width / box-drawing punctuation; searched with strstr().
#define PUNCTION ("。 , 、 ; : ? ! “ ” ‘  ’╗ ╚ ┐ └ (  ) … … — — —  《  》  〈  〉 · .")

// Space-separated list of ASCII punctuation; searched with strstr().
// The double quote has to be written as \" — an unescaped quote ends the
// literal early and leaves an unterminated character constant behind it.
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

 

using namespace std;

 

// Reads the corpus file `in_file` line by line and writes the generated
// feature lines to `out_file`. Defined further down in this file.
int trans_file(const char *in_file,const char *out_file);

// Cuts `srcline` at its first CR or LF and returns the resulting length.
int  chomp(char *srcline);

// Turns one space-separated line into (character, position tag) pairs that
// are appended to `v_dst`, surrounded by begin/end sentinel entries.
void from_seg_to_tag(const char *line,vector<pair<string,string> > &v_dst);

// Builds one feature string per non-sentinel entry of `v_dst` and appends
// it to `v_data`.
void from_tag_to_data(vector<pair<string,string> > &v_dst, vector<string> &v_data);

 

int main(int argc,char **argv)

{

/* if (argc != 3)

{

printf("usage:%s in_file out_file/n", argv[0]);

return(1);

}

 

trans_file(argv[1],argv[2]);

*/

trans_file("pku","pku_me");

return(1);

}

 

/**
 * Strips the line ending from a C string in place.
 *
 * The string is cut at the first carriage return (0x0D) or line feed (0x0A);
 * everything from that byte onwards is discarded.
 *
 * @param srcline NUL-terminated, writable buffer.
 * @return Length of the string after truncation.
 */
int chomp(char *srcline)
{
    char *cursor = srcline;

    while (*cursor != '\0' && *cursor != 0x0D && *cursor != 0x0A)
        ++cursor;

    *cursor = '\0';
    return(static_cast<int>(cursor - srcline));
}

 

/**
 * Checks whether a string contains exactly three runs of spaces.
 *
 * A run is one or more consecutive ' ' characters; runs at the very start
 * or end of the string are counted like any other.
 *
 * @param str Text to inspect.
 * @return true when the number of space runs is exactly 3, false otherwise.
 */
bool ValidColumn(const string &str)
{
    size_t gaps = 0;
    bool in_gap = false;

    for (size_t i = 0; i < str.size(); ++i)
    {
        const bool is_space = (str[i] == ' ');
        // A new run starts on the first space that follows a non-space.
        if (is_space && !in_gap)
            ++gaps;
        in_gap = is_space;
    }

    return(gaps == 3);
}

 

int trans_file(const char *in_file,const char *out_file)

{

FILE *fin,*fout,*fout_tst;

char line[MAXLINELEN];

 

fin = fopen(in_file,"rb");

if(NULL == fin)

{

printf("can't open %s/n",in_file);

return(-1);

}

fout = fopen(out_file,"wb");

fout_tst = fopen("./test.txt","wb");

while(!feof(fin))

{

fgets(line,MAXLINELEN,fin);

if(chomp(line) < 2)

continue;

vector<string> v_data;

vector<pair<string,string> > v_dst;

 

from_seg_to_tag(line,v_dst);

from_tag_to_data(v_dst, v_data);

for(size_t n = 0;n < v_data.size();n++)

{

fprintf(fout, "%s/n",v_data[n].c_str());

const char *pstr = strchr(v_data[n].c_str(),' ');

if(pstr)

{

fprintf(fout_tst,"%s/n",pstr+1);

}

}

fprintf(fout,"/n");

}

fclose(fin);

fclose(fout);

fclose(fout_tst);

return(1);

}

 

/**
 * Tells whether a word counts as a number.
 *
 * A word is a number when its first byte is an ASCII digit, or when the
 * whole word occurs as a substring of the C_NUMBER list.
 *
 * An empty word is never a number; without the explicit check strstr()
 * would report a match, because an empty needle is found in any string.
 *
 * @param word_cur Word to classify.
 * @return true when the word is considered a number.
 */
bool IsNumber(const string &word_cur)
{
    if (word_cur.empty())
        return(false);

    // Cast through unsigned char so bytes >= 0x80 are passed to isdigit()
    // as valid values regardless of whether plain char is signed.
    if (isdigit(static_cast<unsigned char>(word_cur[0])))
        return(true);

    return(strstr(C_NUMBER, word_cur.c_str()) != NULL);
}

 

/**
 * Tells whether a word counts as punctuation.
 *
 * The word is looked up as a substring first in the PUNCTION list and then
 * in the E_PUNCTION list.
 *
 * @param word_cur Word to classify.
 * @return true when the word occurs in either list.
 */
bool IsPunc(const string &word_cur)
{
    const char *needle = word_cur.c_str();

    const bool in_wide_list = (strstr(PUNCTION, needle) != NULL);
    return(in_wide_list || strstr(E_PUNCTION, needle) != NULL);
}

/**
 * Splits a "word/tag" token at its first '/'.
 *
 * @param word Receives everything before the first '/' (the whole of `src`
 *             when there is no '/').
 * @param tgt  Receives everything after the first '/', with spaces removed;
 *             empty when there is no '/'.
 * @param src  Token to split.
 *
 * When `src` contains no '/', an error line is printed to stdout.
 */
void split_word_tgt(string &word,string &tgt,const string &src)
{
    word = "";
    tgt = "";

    const string::size_type slash = src.find('/');
    if (slash == string::npos)
    {
        word = src;
        printf("error in split %s\n", src.c_str());
        return;
    }

    word.assign(src, 0, slash);
    for (string::size_type n = slash + 1; n < src.length(); n++)
    {
        if (src[n] != ' ')
            tgt += src[n];
    }
}

 

/*给字附着属性信息*/

void from_tag_to_data(vector<pair<string,string> > &v_dst, vector<string> &v_data)

{

size_t n,size;

string word="";

 

size = v_dst.size() - 2;

for(n = 2;n < size;n++)

{

string tmp = "";

word = v_dst[n].first;

//current tag

tmp += v_dst[n].second; //cur tag

 

//template b

tmp += " 1_";

tmp += v_dst[n].first;

 

//template c

tmp += " 2_";

tmp += v_dst[n-1].first;

//tmp += " 3_";

//tmp += v_dst[n-2].first; 

tmp += " 4_";

tmp += v_dst[n+1].first;

//tmp += " 5_";

//tmp += v_dst[n+2].first; 

 

//template d

tmp += " 6_";

tmp += v_dst[n-1].first + "_" + v_dst[n].first;

tmp += " 7_";

tmp += v_dst[n].first + "_" + v_dst[n+1].first;

//tmp += " 8_";

//tmp += v_dst[n-2].first + "_" + v_dst[n-1].first;

//tmp += " 9_";

//tmp += v_dst[n+1].first + "_" + v_dst[n+2].first;

 

//template e

tmp += " A_";

tmp += v_dst[n-1].first + "_" + v_dst[n+1].first;

 

/*

//template f

tmp += " t2_";

tmp += v_dst[n-2].second;

tmp += " t1_";

tmp += v_dst[n-1].second; */

 

//punctuation

if(IsPunc(word) == true)

tmp += " y_punc";

else

tmp += " n_punc";

//number

if(IsNumber(word) == true)

tmp += " y_num";

else

tmp += " n_num";

v_data.push_back(tmp);

}

return;

}

 

/**
 * Reads one space-delimited word and splits it into units.
 *
 * Starting at `line`, bytes are consumed up to the next ' ' or the end of
 * the string. A run of ASCII bytes (below 0x80) becomes a single unit;
 * every byte of 0x80 or above is combined with the byte that follows it
 * into a two-byte unit.
 * NOTE(review): the two-byte grouping presumably targets a double-byte
 * encoding such as GBK — confirm against the corpus encoding.
 *
 * A high byte that is the very last byte of the string is stored as a
 * one-byte unit, so the scan never moves past the terminating NUL.
 *
 * @param line  Start of the word inside a NUL-terminated string.
 * @param array Receives the units, appended in order.
 * @return Pointer to the ' ' or NUL at which scanning stopped.
 */
const char *split_char_str(const char *line,vector<string> &array)
{
    const char *pline = line;

    while (*pline && *pline != ' ')
    {
        // Compare as unsigned so the test does not depend on whether plain
        // char is signed on the target platform.
        if (static_cast<unsigned char>(*pline) < 0x80)
        {
            string tmp;
            while (*pline && *pline != ' ' &&
                   static_cast<unsigned char>(*pline) < 0x80)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
        else
        {
            string tmp(1, *pline);
            pline++;
            // Take the trail byte only when one exists.
            if (*pline)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
    }
    return(pline);
}

 

void from_seg_to_tag(const char *line,vector<pair<string,string> > &v_dst)

{

string first = "<beg1>";

string second = "beg1";

v_dst.push_back(make_pair(first, second));

first = "<beg2>";

second = "beg2";

v_dst.push_back(make_pair(first,second));

 

const char *pline = line;

 

while(*pline)

{

if(*pline != ' ')

{

vector<string> array;

pline = split_char_str(pline,array);

 

for(size_t n = 0;n < array.size();n++)

{

if(n == 0)

{

string tmp;

if(1 == array.size())

tmp = "S";

else

tmp = "B";

v_dst.push_back(make_pair(array[n],tmp));

}

else

{

string tmp;

if((n+1) == array.size())//last one

tmp = "E";

else

tmp = "M";

v_dst.push_back(make_pair(array[n],tmp));

}

}

}

else

{

pline++; 

}

}

first = "<end1>";

second = "end1";

v_dst.push_back(make_pair(first,second));

first = "<end2>";

second = "end2";

v_dst.push_back(make_pair(first,second));

return;

}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值