C++读写文件,处理UTF8文件,处理GBK中文字符

读文件

	//从文件中提取词典
	void getLexiconFromTrainData(char* filepath){
		maxLength = 0;
		lexicalItemCount=0;
		allSentenceCount=0;
		wordCount=0;


		ifstream infile;
		char a;
		string line;
		string word;



		infile.open(filepath);//打开文件
		if(!infile){
			cerr<<"error:unable to open input file: "<<infile<<endl;
		}

		//每次读取一个字符进行处理
		do{
		infile.get(a);
		if(infile.eof())
		break;
		cout<<a;
		getchar();
		}while(!infile.eof());
		

		//每次读取一行进行处理,行与行之间以回车换行分隔
		while(getline(infile,line)){
		allSentence.push_back(line);//每一行都保存到vector中
		//cout<<line;
		}
		cout<<"文件读取完毕"<<endl;
		cout<<"vector长度"<<allSentence.size()<<endl;
		


		//每次读一个词进行处理,词和词之间用空格分开
		while( infile> >word ){    
			//cout << "Read from file: "<< word<< endl;
			wordCount++;
			lexicalItem.insert(word);
			if(word.size()>maxLength){
				maxLength = word.size();
				cout<<"迄今为止,最长的词"<<word<<",长度为:"<<word.size()/2<<endl;
			}
			//getchar();
		}
		maxLength = maxLength;
		infile.close();

		cout<<"总词汇量:"<<wordCount<<endl;
		cout<<"词典词数:"<<lexicalItem.size()<<endl;
	}

写文件

//把所有的句子保存回文件
void FileIOfunc::saveAllSentenceToFile(char* filepath,vector<vector<string> > resultSentence){

		ofstream outfile;
		stringstream ss;

		//outfile.open(filepath,ios::app);//以追加方式写文件
		outfile.open(filepath);//以覆盖方式写文件

		//把所有的句子保存到文件
		for(vector<vector<string> >::iterator oneSentence = resultSentence.begin();oneSentence!=resultSentence.end();oneSentence++){
			for(vector<string>::iterator oneWord = (*oneSentence).begin();oneWord!=(*oneSentence).end();oneWord++){
				ss<<*oneWord<<" ";
			}
			ss<<endl;
		}
		outfile<<ss.str();
		cout<<"所有的句子保存回文件完成"<<endl;

		outfile.clear();
		outfile.close();
	}
处理UTF8文件

/**
 * Returns the number of bytes occupied by the UTF-8 encoded character whose
 * first byte is `firstByte`, i.e. how far to advance to reach the next
 * character.
 *
 *   0xxxxxxx -> 1 (ASCII)
 *   110xxxxx -> 2
 *   1110xxxx -> 3
 *   11110xxx -> 4
 *
 * Any other byte (a 10xxxxxx continuation byte or 11111xxx) is not a valid
 * lead byte; 1 is returned so that a caller stepping through malformed input
 * advances a single byte and resynchronises instead of skipping over data or
 * running past the end of the buffer.
 *
 * Only the lead-byte bit pattern is inspected; the following bytes are not
 * validated.
 *
 * @param firstByte first byte of the encoded character.
 * @return length of the encoded character in bytes (1 to 4).
 */
int utf8_char_len(char firstByte)
{
	// Work on the unsigned value so the comparisons do not depend on whether
	// plain char is signed on this platform.
	const unsigned char lead = static_cast<unsigned char>(firstByte);

	if(lead < 0x80){ // 0xxxxxxx
		return 1;
	}
	if((lead & 0xE0) == 0xC0){ // 110xxxxx
		return 2;
	}
	if((lead & 0xF0) == 0xE0){ // 1110xxxx
		return 3;
	}
	if((lead & 0xF8) == 0xF0){ // 11110xxx
		return 4;
	}
	return 1; // continuation or invalid byte: step one byte to resynchronise
}

处理GBK中文字符

/**
 * Reports whether `str` starts with a non-ASCII byte.
 *
 * Only the high bit of the first byte is tested, so the result is the same
 * for any multi-byte encoding (GBK, UTF-8, ...): every byte >= 0x80 counts
 * as "Chinese", and no check is made that the byte really begins a Chinese
 * character. An empty string yields 0 because its first byte is the
 * terminating NUL.
 *
 * @param str text whose first byte is inspected.
 * @return 1 if the first byte has its high bit set, otherwise 0.
 */
int SentenceEncoder::isChineseCharacter(string str){
	const unsigned char leadByte = static_cast<unsigned char>(str.c_str()[0]);
	return (leadByte >= 0x80) ? 1 : 0;
}



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值