// Convert the corpus into the ME format.
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>
#include <utility>
// Size in bytes of the line buffer used when reading the input file.
// Parenthesized so the macro expands safely inside larger expressions.
#define MAXLINELEN (1024*5)
// Space-separated numeral tokens searched by IsNumber().
#define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟 1 2 3 4 5 6 7 8 9 0")
// Punctuation tokens searched first by IsPunc().
#define PUNCTION ("。 , 、 ; : ? ! “ ” ‘ ’╗ ╚ ┐ └ ( ) … … — — — 《 》 〈 〉 · .")
// Punctuation tokens searched second by IsPunc(). The double quote must be
// escaped with a backslash, otherwise the literal terminates early.
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")
using namespace std;
int trans_file(const char *in_file,const char *out_file);
int chomp(char *srcline);
void from_seg_to_tag(const char *line,vector<pair<string,string> > &v_dst);
void from_tag_to_data(vector<pair<string,string> > &v_dst, vector<string> &v_data);
int main(int argc,char **argv)
{
/* if (argc != 3)
{
printf("usage:%s in_file out_file/n", argv[0]);
return(1);
}
trans_file(argv[1],argv[2]);
*/
trans_file("pku","pku_me");
return(1);
}
/*
 * Cuts srcline at the first carriage return (0x0D) or line feed (0x0A) by
 * writing a NUL terminator there; a string without either is left unchanged.
 *
 * srcline: NUL-terminated, writable buffer.
 * Returns the length of the string after truncation.
 */
int chomp(char *srcline)
{
    char *cursor = srcline;
    while (*cursor != 0 && *cursor != 0x0D && *cursor != 0x0A)
        ++cursor;
    *cursor = 0;
    return(static_cast<int>(cursor - srcline));
}
/*
 * Reports whether str contains exactly three runs of space characters.
 * A run is one or more consecutive ' ' characters; runs at the very start or
 * end of the string are counted like any other.
 */
bool ValidColumn(const string &str)
{
    size_t runs = 0;
    bool in_gap = false;
    for (size_t pos = 0; pos < str.size(); ++pos)
    {
        const bool is_space = (str[pos] == ' ');
        /* A run is counted once, on its first space. */
        if (is_space && !in_gap)
            ++runs;
        in_gap = is_space;
    }
    return(runs == 3);
}
/*
 * Converts the segmented text in in_file into feature lines.
 *
 * For every input line of at least two bytes (after the line ending is
 * stripped) the line is expanded by from_seg_to_tag() and
 * from_tag_to_data(), and each resulting sample is written
 *   - in full to out_file, one sample per line, with an empty line after
 *     each input line;
 *   - without its leading field (everything after the first space) to
 *     "./test.txt".
 *
 * Input lines longer than MAXLINELEN-1 bytes are read in several pieces,
 * because fgets() stops when the buffer is full.
 *
 * Returns 1 on success, -1 if any of the three files cannot be opened.
 */
int trans_file(const char *in_file,const char *out_file)
{
    FILE *fin = fopen(in_file,"rb");
    if(NULL == fin)
    {
        printf("can't open %s\n",in_file);
        return(-1);
    }
    FILE *fout = fopen(out_file,"wb");
    if(NULL == fout)
    {
        printf("can't open %s\n",out_file);
        fclose(fin);
        return(-1);
    }
    FILE *fout_tst = fopen("./test.txt","wb");
    if(NULL == fout_tst)
    {
        printf("can't open %s\n","./test.txt");
        fclose(fin);
        fclose(fout);
        return(-1);
    }

    char line[MAXLINELEN];
    /* Drive the loop from fgets() itself: testing feof() before reading
       would process the last line twice (or an unset buffer on an empty
       file), because feof() only turns true after a read has failed. */
    while(fgets(line,static_cast<int>(sizeof(line)),fin) != NULL)
    {
        if(chomp(line) < 2)
            continue;
        vector<string> v_data;
        vector<pair<string,string> > v_dst;
        from_seg_to_tag(line,v_dst);
        from_tag_to_data(v_dst, v_data);
        for(size_t n = 0;n < v_data.size();n++)
        {
            fprintf(fout, "%s\n",v_data[n].c_str());
            /* Drop the first field (up to the first space) for test.txt. */
            const char *pstr = strchr(v_data[n].c_str(),' ');
            if(pstr)
            {
                fprintf(fout_tst,"%s\n",pstr+1);
            }
        }
        /* Empty line separates the samples of consecutive input lines. */
        fprintf(fout,"\n");
    }
    fclose(fin);
    fclose(fout);
    fclose(fout_tst);
    return(1);
}
/*
 * Reports whether word_cur looks like a number.
 *
 * True when the first byte is an ASCII digit, or when the whole word occurs
 * as a byte substring of C_NUMBER. Because the lookup is a substring search,
 * any fragment of C_NUMBER matches, not only complete entries.
 *
 * An empty word is never a number (a plain strstr() would match it, since
 * the empty string is a substring of everything).
 */
bool IsNumber(const string &word_cur)
{
    if (word_cur.empty())
        return(false);
    /* Inspect the byte as unsigned so the test does not depend on whether
       plain char is signed, and so isdigit() never sees a negative value. */
    const unsigned char lead = static_cast<unsigned char>(word_cur[0]);
    if (lead < 0x80 && isdigit(lead))
        return(true);
    return(strstr(C_NUMBER,word_cur.c_str()) != NULL);
}
/*
 * Reports whether word_cur is a punctuation token.
 *
 * True when the word occurs as a byte substring of PUNCTION or of
 * E_PUNCTION. Because the lookup is a substring search, any fragment of
 * those lists matches, not only complete entries.
 *
 * An empty word is never punctuation (a plain strstr() would match it,
 * since the empty string is a substring of everything).
 */
bool IsPunc(const string &word_cur)
{
    if (word_cur.empty())
        return(false);
    const char *needle = word_cur.c_str();
    return(strstr(PUNCTION,needle) != NULL || strstr(E_PUNCTION,needle) != NULL);
}
/*
 * Splits a "word/tag" item at its first '/'.
 *
 * word: receives everything before the first '/'.
 * tgt:  receives everything after the first '/', with spaces removed.
 * src:  the item to split.
 *
 * When src contains no '/', word receives all of src, tgt is left empty and
 * an error line is printed to stdout.
 */
void split_word_tgt(string &word,string &tgt,const string &src)
{
    word = "";
    tgt = "";
    const size_t slash = src.find('/');
    if (slash == string::npos)
    {
        word = src;
        printf("error in split %s\n",src.c_str());
        return;
    }
    word.assign(src, 0, slash);
    for (size_t n = slash + 1; n < src.length(); n++)
    {
        if (src[n] != ' ')
            tgt += src[n];
    }
    return;
}
/*
 * Attaches feature information to each unit and emits one sample per unit.
 *
 * v_dst:  (unit, tag) pairs. The first two and the last two entries are
 *         skipped as samples and only serve as left/right context
 *         (from_seg_to_tag() places its begin/end markers there).
 * v_data: receives one string per remaining entry, appended in order:
 *           TAG 1_cur 2_prev 4_next 6_prev_cur 7_cur_next A_prev_next
 *           y_punc|n_punc y_num|n_num
 *         where prev/cur/next are the units at n-1, n and n+1.
 *
 * The feature numbers 3, 5, 8 and 9 (units at distance two) and the
 * previous-tag features t1/t2 are intentionally not emitted.
 */
void from_tag_to_data(vector<pair<string,string> > &v_dst, vector<string> &v_data)
{
    /* Fewer than four entries leaves nothing between the skipped borders;
       returning here also keeps size() - 2 from wrapping around. */
    if (v_dst.size() < 4)
        return;
    const size_t last = v_dst.size() - 2;
    for (size_t n = 2; n < last; n++)
    {
        const string &prev = v_dst[n-1].first;
        const string &cur = v_dst[n].first;
        const string &next = v_dst[n+1].first;

        /* current tag */
        string tmp = v_dst[n].second;
        /* template b: the unit itself */
        tmp += " 1_";
        tmp += cur;
        /* template c: neighbouring units */
        tmp += " 2_";
        tmp += prev;
        tmp += " 4_";
        tmp += next;
        /* template d: adjacent pairs */
        tmp += " 6_";
        tmp += prev + "_" + cur;
        tmp += " 7_";
        tmp += cur + "_" + next;
        /* template e: the two neighbours around the current unit */
        tmp += " A_";
        tmp += prev + "_" + next;
        /* punctuation flag */
        tmp += IsPunc(cur) ? " y_punc" : " n_punc";
        /* number flag */
        tmp += IsNumber(cur) ? " y_num" : " n_num";
        v_data.push_back(tmp);
    }
    return;
}
/*
 * Splits one space-delimited word into units and appends them to array.
 *
 * Scanning starts at line and stops at the first space or at the end of the
 * string. Within the word:
 *   - a run of consecutive bytes below 0x80 becomes a single unit;
 *   - a byte with the high bit set is combined with the byte that follows it
 *     into one two-byte unit (presumably a double-byte encoding such as GBK;
 *     confirm against the corpus). If the string ends right after such a
 *     byte, the lone byte becomes a unit on its own instead of reading past
 *     the terminator.
 *
 * Returns a pointer to the space or NUL at which scanning stopped.
 */
const char *split_char_str(const char *line,vector<string> &array)
{
    const char *pline = line;
    while(*pline && *pline != ' ')
    {
        /* Classify bytes as unsigned so the result does not depend on
           whether plain char is signed on the target platform. */
        if(static_cast<unsigned char>(*pline) < 0x80)
        {
            string tmp;
            while(*pline && *pline != ' ' && static_cast<unsigned char>(*pline) < 0x80)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
        else
        {
            string tmp(1, *pline);
            pline++;
            /* Only consume a second byte if there is one. */
            if(*pline)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
    }
    return(pline);
}
/*
 * Turns a space-segmented line into a sequence of (unit, tag) pairs.
 *
 * v_dst receives, in order:
 *   - two leading markers ("<beg1>","beg1") and ("<beg2>","beg2");
 *   - for every space-delimited word, the units produced by
 *     split_char_str(), tagged by their position inside the word:
 *       "S" only unit, "B" first unit, "M" inner unit, "E" last unit;
 *   - two trailing markers ("<end1>","end1") and ("<end2>","end2").
 */
void from_seg_to_tag(const char *line,vector<pair<string,string> > &v_dst)
{
    v_dst.push_back(make_pair(string("<beg1>"), string("beg1")));
    v_dst.push_back(make_pair(string("<beg2>"), string("beg2")));

    for (const char *cursor = line; *cursor; )
    {
        /* Spaces only separate words. */
        if (*cursor == ' ')
        {
            ++cursor;
            continue;
        }
        vector<string> units;
        cursor = split_char_str(cursor, units);
        const size_t count = units.size();
        for (size_t idx = 0; idx < count; ++idx)
        {
            const char *tag;
            if (count == 1)
                tag = "S";
            else if (idx == 0)
                tag = "B";
            else if (idx + 1 == count)
                tag = "E";
            else
                tag = "M";
            v_dst.push_back(make_pair(units[idx], string(tag)));
        }
    }

    v_dst.push_back(make_pair(string("<end1>"), string("end1")));
    v_dst.push_back(make_pair(string("<end2>"), string("end2")));
    return;
}