读文件
//从文件中提取词典
void getLexiconFromTrainData(char* filepath){
maxLength = 0;
lexicalItemCount=0;
allSentenceCount=0;
wordCount=0;
ifstream infile;
char a;
string line;
string word;
infile.open(filepath);//打开文件
if(!infile){
cerr<<"error:unable to open input file: "<<infile<<endl;
}
//每次读取一个字符进行处理
do{
infile.get(a);
if(infile.eof())
break;
cout<<a;
getchar();
}while(!infile.eof());
//每次读取一行进行处理,行与行之间以回车换行分隔
while(getline(infile,line)){
allSentence.push_back(line);//每一行都保存到vector中
//cout<<line;
}
cout<<"文件读取完毕"<<endl;
cout<<"vector长度"<<allSentence.size()<<endl;
//每次读一个词进行处理,词和词之间用空格分开
while( infile> >word ){
//cout << "Read from file: "<< word<< endl;
wordCount++;
lexicalItem.insert(word);
if(word.size()>maxLength){
maxLength = word.size();
cout<<"迄今为止,最长的词"<<word<<",长度为:"<<word.size()/2<<endl;
}
//getchar();
}
maxLength = maxLength;
infile.close();
cout<<"总词汇量:"<<wordCount<<endl;
cout<<"词典词数:"<<lexicalItem.size()<<endl;
}
写文件
//把所有的句子保存回文件
void FileIOfunc::saveAllSentenceToFile(char* filepath,vector<vector<string> > resultSentence){
ofstream outfile;
stringstream ss;
//outfile.open(filepath,ios::app);//以追加方式写文件
outfile.open(filepath);//以覆盖方式写文件
//把所有的句子保存到文件
for(vector<vector<string> >::iterator oneSentence = resultSentence.begin();oneSentence!=resultSentence.end();oneSentence++){
for(vector<string>::iterator oneWord = (*oneSentence).begin();oneWord!=(*oneSentence).end();oneWord++){
ss<<*oneWord<<" ";
}
ss<<endl;
}
outfile<<ss.str();
cout<<"所有的句子保存回文件完成"<<endl;
outfile.clear();
outfile.close();
}
处理UTF8文件
// Returns how many bytes the UTF-8 sequence starting with firstByte occupies,
// i.e. the offset to advance by to reach the next character.
//
// Classification by the leading bits of firstByte:
//   0xxxxxxx -> 1 (ASCII)
//   110xxxxx -> 2
//   1110xxxx -> 3
//   11110xxx -> 4
// Any other byte (a 10xxxxxx continuation byte, or 11111xxx) is not a
// recognised lead byte; 1 is returned for those so that a caller stepping
// through a buffer advances a single byte instead of skipping over data.
// Only the bit pattern is inspected; the bytes that follow are not validated.
int utf8_char_len(char firstByte)
{
    // Work on the unsigned value so the comparisons do not depend on whether
    // plain char is signed on this platform.
    const unsigned char lead = static_cast<unsigned char>(firstByte);

    if((lead & 0x80) == 0x00) // 0xxxxxxx
        return 1;
    if((lead & 0xE0) == 0xC0) // 110xxxxx
        return 2;
    if((lead & 0xF0) == 0xE0) // 1110xxxx
        return 3;
    if((lead & 0xF8) == 0xF0) // 11110xxx
        return 4;

    // Continuation byte or invalid lead byte.
    return 1;
}
处理GBK中文字符
// Reports whether str starts with a non-ASCII byte.
//
// Returns 1 when the first byte of str has its high bit (0x80) set and 0
// otherwise; an empty string yields 0 because its first byte is the
// terminating '\0'. The original comments describe this as a "Chinese
// character" test for GBK/UTF-8 text, but the check itself only looks at the
// high bit and does not distinguish encodings or scripts.
int SentenceEncoder::isChineseCharacter(string str){
    const char* bytes = str.c_str();
    const bool highBitSet = (bytes[0] & 0x80) != 0;
    return highBitSet ? 1 : 0;
}