// Convert an ordinary (word-segmented) corpus into a Microsoft-format CRF corpus.
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>
/* Size in bytes of the buffer used to read one input line. Parenthesised so
   that the macro expands safely inside larger expressions. */
#define MAXLINELEN (1024*5)
/* Space-separated table of numerals (Chinese numerals followed by the ASCII
   digits). IsNumber() searches it with strstr(). */
#define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟 1 2 3 4 5 6 7 8 9 0")
/* Space-separated table of CJK / typographic punctuation (it also holds a few
   box-drawing characters). IsPunc() searches it with strstr(). */
#define PUNCTION ("。 , 、 ; : ? ! “ ” ‘ ’╗ ╚ ┐ └ ( ) … … — — — 《 》 〈 〉 · .")
/* Space-separated table of ASCII punctuation, also searched by IsPunc().
   The double quote has to be written as an escape sequence; an unescaped
   quote would end the literal early and break every use of the macro. */
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")
using namespace std;
/* Read in_file line by line, convert every line and write the rows to
   out_file. Returns 1 after processing; a negative value signals that a
   file could not be opened. */
int trans_file(const char *in_file,const char *out_file);
/* Truncate srcline at its first CR or LF (in place) and return the length of
   what remains. */
int chomp(char *srcline);
/* Split a space-separated line into units and append one "unit/TAG " string
   per unit to dst_line; the tag (S, B, B2, B3, M, E) depends on the unit's
   position inside its space-delimited token. */
void from_seg_to_tag(const char *line,vector<string> &dst_line);
/* For every "unit/TAG" entry of v_dst append one row
   "unit <punctuation flag> <number flag> TAG" to v_data. */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);
/*
 * Program entry point.
 *
 * Expects exactly two arguments: the input corpus path and the output path.
 * Returns 0 when the conversion ran, and 1 on a usage error or when
 * trans_file() reports a failure.
 */
int main(int argc,char **argv)
{
    if (argc != 3)
    {
        printf("usage:%s in_file out_file\n", argv[0]);
        return(1);
    }
    /* trans_file() returns a negative value when it cannot open a file;
       turn that into a failing exit status instead of ignoring it. */
    if (trans_file(argv[1],argv[2]) < 0)
        return(1);
    return(0);
}
/*
 * Cut a line at its first carriage return or line feed.
 *
 * The buffer is modified in place. The return value is the length of the
 * text that remains, i.e. the index at which the terminator was written.
 */
int chomp(char *srcline)
{
    /* Length of the leading part that contains neither '\r' nor '\n'. */
    const size_t keep = strcspn(srcline, "\r\n");
    srcline[keep] = '\0';
    return static_cast<int>(keep);
}
/*
 * Check that a row contains exactly three runs of spaces, which is the
 * layout of a four-field row "a b c d".
 *
 * Consecutive spaces count as a single run; a run at the very start or end
 * of the string is counted like any other.
 */
bool ValidColumn(const string &str)
{
    size_t gaps = 0;
    bool in_gap = false;
    for(size_t pos = 0;pos < str.size();++pos)
    {
        const bool is_space = (str[pos] == ' ');
        /* A run is counted once, at its first space. */
        if(is_space && !in_gap)
            ++gaps;
        in_gap = is_space;
    }
    return gaps == 3;
}
/*
 * Convert a whole corpus file.
 *
 * Every input line is stripped of its line ending, split into tagged units
 * (from_seg_to_tag) and expanded into feature rows (from_tag_to_data). Rows
 * that pass ValidColumn() are written to out_file, one per line, followed by
 * an empty line after each input line; rows that fail are reported on stdout.
 * Lines shorter than two bytes after stripping are skipped. Input is read in
 * pieces of at most MAXLINELEN-1 bytes, so a longer line is handled as
 * several separate lines.
 *
 * Returns 1 when the file was processed, -1 when either file cannot be opened.
 */
int trans_file(const char *in_file,const char *out_file)
{
    char line[MAXLINELEN];
    FILE *fin = fopen(in_file,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",in_file);
        return(-1);
    }
    FILE *fout = fopen(out_file,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",out_file);
        fclose(fin);
        return(-1);
    }
    /* Loop on the result of fgets() itself: testing feof() before reading
       would process the last line a second time once the read fails. */
    while(fgets(line,MAXLINELEN,fin) != NULL)
    {
        if(chomp(line) < 2)
            continue;
        vector<string> v_dst,v_data;
        from_seg_to_tag(line,v_dst);
        from_tag_to_data(v_dst,v_data);
        for(size_t n = 0;n < v_data.size();n++)
        {
            if(ValidColumn(v_data[n]))
                fprintf(fout,"%s\n",v_data[n].c_str());
            else
                printf("column size error =%s\n",v_data[n].c_str());
        }
        /* An empty line separates the rows of consecutive input lines. */
        fprintf(fout,"\n");
    }
    fclose(fin);
    fclose(fout);
    return(1);
}
/*
 * Tell whether a unit counts as a number.
 *
 * A unit is a number when it starts with an ASCII digit or when it occurs in
 * the C_NUMBER table. An empty unit is never a number.
 */
bool IsNumber(const string &word_cur)
{
    /* strstr() matches an empty needle everywhere, so reject it up front. */
    if(word_cur.empty())
        return(false);
    /* isdigit() only promises a non-zero result for digits, so it must not be
       compared with `true`; the argument has to be representable as an
       unsigned char, which bytes of multi-byte characters are not when char
       is signed. */
    if(isdigit(static_cast<unsigned char>(word_cur[0])))
        return(true);
    return(strstr(C_NUMBER,word_cur.c_str()) != NULL);
}
/*
 * Tell whether a unit is a punctuation mark, i.e. whether it occurs in the
 * PUNCTION or the E_PUNCTION table. An empty unit is never punctuation.
 */
bool IsPunc(const string &word_cur)
{
    /* strstr() matches an empty needle everywhere, so reject it up front. */
    if(word_cur.empty())
        return(false);
    const char *needle = word_cur.c_str();
    return(strstr(PUNCTION,needle) != NULL || strstr(E_PUNCTION,needle) != NULL);
}
/*
 * Split a "word/TAG" entry into its two parts.
 *
 * word receives everything before the LAST '/', tgt receives what follows it
 * with all spaces removed. Splitting at the last slash keeps units that
 * contain a slash themselves intact (e.g. "//S " gives word "/" and tag "S").
 * When src has no '/', an error is printed, word receives the whole of src
 * and tgt stays empty.
 */
void split_word_tgt(string &word,string &tgt,const string &src)
{
    word = "";
    tgt = "";
    const size_t slash = src.rfind('/');
    if(slash == string::npos)
    {
        word = src;
        printf("error in split %s\n",src.c_str());
        return;
    }
    word = src.substr(0,slash);
    for(size_t n = slash + 1;n < src.length();n++)
    {
        /* Entries carry a trailing space after the tag; drop it. */
        if(src[n] != ' ')
            tgt += src[n];
    }
    return;
}
/*
 * Attach feature information to every tagged unit.
 *
 * Each entry of v_dst ("unit/TAG") yields one row appended to v_data:
 *     unit <y_punc|n_punc> <y_num|n_num> TAG
 */
void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data)
{
    const size_t total = v_dst.size();
    for(size_t idx = 0;idx != total;++idx)
    {
        string unit,label;
        split_word_tgt(unit,label,v_dst[idx]);
        /* punctuation feature */
        const char *punc_flag = IsPunc(unit) ? "y_punc" : "n_punc";
        /* number feature */
        const char *num_flag = IsNumber(unit) ? "y_num" : "n_num";
        string row(unit);
        row.append(" ");
        row.append(punc_flag);
        row.append(" ");
        row.append(num_flag);
        row.append(" ");
        row.append(label);
        v_data.push_back(row);
    }
}
/*
 * Break one space-delimited token into units and append them to array.
 *
 * Scanning starts at line and stops at the first space or at the end of the
 * string; the position where it stopped is returned. A run of bytes below
 * 0x80 becomes a single unit. A byte of 0x80 or above is taken together with
 * the byte that follows it as one two-byte unit.
 * NOTE(review): this presumably expects a double-byte encoding such as
 * GBK/GB2312 for the corpus - confirm against the actual input files.
 *
 * A lead byte that is the last byte of the string is stored on its own, so
 * the scan never moves past the terminating NUL.
 */
const char *split_char_str(const char *line,vector<string> &array)
{
    const char *pline = line;
    while(*pline && *pline != ' ')
    {
        /* Compare as unsigned char so the test does not depend on whether
           plain char is signed on the target platform. */
        if(static_cast<unsigned char>(*pline) < 0x80)
        {
            string tmp;
            while(*pline && *pline != ' ' && static_cast<unsigned char>(*pline) < 0x80)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
        else
        {
            string tmp(1,*pline);
            pline++;
            /* Only consume the second byte when there is one. */
            if(*pline)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
    }
    return(pline);
}
/*
 * Turn a space-separated line into position-tagged units.
 *
 * Every space-delimited token is broken into units by split_char_str(); each
 * unit is appended to v_dst as "unit/TAG " where TAG depends on the unit's
 * position inside its token:
 *     one unit            -> S
 *     first / second / third unit -> B / B2 / B3
 *     any further inner unit      -> M
 *     last unit of a longer token -> E
 */
void from_seg_to_tag(const char *line,vector<string> &v_dst)
{
    const char *cursor = line;
    while(*cursor)
    {
        /* Spaces only separate tokens. */
        if(*cursor == ' ')
        {
            ++cursor;
            continue;
        }
        vector<string> units;
        cursor = split_char_str(cursor,units);
        const size_t count = units.size();
        for(size_t idx = 0;idx < count;++idx)
        {
            const char *suffix;
            if(idx + 1 == count)
                suffix = (count == 1) ? "/S " : "/E ";
            else if(idx == 0)
                suffix = "/B ";
            else if(idx == 1)
                suffix = "/B2 ";
            else if(idx == 2)
                suffix = "/B3 ";
            else
                suffix = "/M ";
            v_dst.push_back(units[idx] + suffix);
        }
    }
}