C++读写文件，处理UTF8文件，处理GBK中文字符

最新推荐文章于 2024-09-01 07:59:44 发布

zhangjcsd

最新推荐文章于 2024-09-01 07:59:44 发布

阅读量5.2k

点赞数

分类专栏： C++

本文链接：https://blog.csdn.net/zhangjcsd/article/details/35207007

版权

C++ 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

读文件

	//从文件中提取词典
	void getLexiconFromTrainData(char* filepath){
		maxLength = 0;
		lexicalItemCount=0;
		allSentenceCount=0;
		wordCount=0;


		ifstream infile;
		char a;
		string line;
		string word;



		infile.open(filepath);//打开文件
		if(!infile){
			cerr<<"error:unable to open input file: "<<infile<<endl;
		}

		//每次读取一个字符进行处理
		do{
		infile.get(a);
		if(infile.eof())
		break;
		cout<<a;
		getchar();
		}while(!infile.eof());
		

		//每次读取一行进行处理,行与行之间以回车换行分隔
		while(getline(infile,line)){
		allSentence.push_back(line);//每一行都保存到vector中
		//cout<<line;
		}
		cout<<"文件读取完毕"<<endl;
		cout<<"vector长度"<<allSentence.size()<<endl;
		


		//每次读一个词进行处理，词和词之间用空格分开
		while( infile> >word ){    
			//cout << "Read from file: "<< word<< endl;
			wordCount++;
			lexicalItem.insert(word);
			if(word.size()>maxLength){
				maxLength = word.size();
				cout<<"迄今为止，最长的词"<<word<<"，长度为："<<word.size()/2<<endl;
			}
			//getchar();
		}
		maxLength = maxLength;
		infile.close();

		cout<<"总词汇量："<<wordCount<<endl;
		cout<<"词典词数："<<lexicalItem.size()<<endl;
	}

写文件

// Writes every sentence to the file at filepath, replacing any previous
// content of that file.
//
// filepath:       path of the output file.
// resultSentence: sentences to write; each inner vector holds the words of
//                 one sentence.
//
// Output format: one sentence per line, every word followed by a single
// space (so each non-empty line ends with a trailing space).
// On failure to open or write the file an error is reported on cerr and the
// completion message is not printed.
void FileIOfunc::saveAllSentenceToFile(char* filepath,vector<vector<string> > resultSentence){

	//ofstream outfile(filepath,ios::app);// would append instead of overwrite
	ofstream outfile(filepath);// default open mode overwrites the file
	if(!outfile){
		cerr<<"error:unable to open output file: "<<filepath<<endl;
		return;
	}

	// Stream the words straight to the file; no intermediate buffer needed.
	for(vector<vector<string> >::const_iterator oneSentence = resultSentence.begin();oneSentence!=resultSentence.end();++oneSentence){
		for(vector<string>::const_iterator oneWord = oneSentence->begin();oneWord!=oneSentence->end();++oneWord){
			outfile<<*oneWord<<" ";
		}
		outfile<<'\n';
	}

	// Close first (this flushes), then inspect the state so that a write or
	// flush error is reported instead of being silently discarded.
	outfile.close();
	if(!outfile){
		cerr<<"error:failed to write output file: "<<filepath<<endl;
		return;
	}
	cout<<"所有的句子保存回文件完成"<<endl;
}

处理UTF8文件

// Returns how many bytes the UTF-8 encoded character starting with
// firstByte occupies, i.e. the offset to advance to reach the next
// character in a UTF-8 byte string.
//
// firstByte: the first byte of the character.
//
// Returns 1 for an ASCII byte (0xxxxxxx), 2 for 110xxxxx, 3 for 1110xxxx and
// 4 for 11110xxx. A byte that cannot start a character - a continuation
// byte (10xxxxxx) or 11111xxx - yields 1, so a caller that lands in the
// middle of a sequence steps forward one byte at a time until it reaches
// the next lead byte instead of skipping over valid data.
int utf8_char_len(char firstByte)
{
	// Work on the unsigned value so the bit tests do not depend on whether
	// plain char is signed on this platform.
	const unsigned char lead = static_cast<unsigned char>(firstByte);

	if((lead & 0x80) == 0x00) // 0xxxxxxx: single-byte (ASCII)
		return 1;
	if((lead & 0xE0) == 0xC0) // 110xxxxx: two-byte sequence
		return 2;
	if((lead & 0xF0) == 0xE0) // 1110xxxx: three-byte sequence
		return 3;
	if((lead & 0xF8) == 0xF0) // 11110xxx: four-byte sequence
		return 4;

	// 10xxxxxx (continuation) or 11111xxx (never valid in UTF-8).
	return 1;
}

处理GBK中文字符

// Reports whether str starts with a non-ASCII byte, which this code treats
// as "is a Chinese character".
//
// str: the text to inspect; only its first byte is examined.
//
// Returns 1 when the most significant bit of the first byte is set and 0
// otherwise. An empty string yields 0, because c_str() then points at the
// terminating '\0'.
//
// NOTE(review): the original notes mention both GBK (two bytes per Chinese
// character) and UTF-8 as the expected encoding - confirm with the callers.
// The check itself only looks at the high bit of the first byte.
int SentenceEncoder::isChineseCharacter(string str){
	const char leadByte = *str.c_str();
	const bool highBitSet = (leadByte & 0x80) != 0;
	return highBitSet ? 1 : 0;
}