继前一篇文章《iconv用法,编码转换(一)》之后,本文补充如何识别一个字符数组中的文本是否为UTF-8编码。如果对非UTF-8编码的文本也调用iconv()进行转换,非UTF-8的内容会被丢弃,因此在调用iconv()之前需要先判断字符集。方法如下:
使用int IsTextUTF8(const char* str,unsigned int length);函数来识别是否为utf8类型:
- #include<iconv.h>
- #include<iostream>
- #include<fstream>
- using namespace std;
- int IsTextUTF8(const char* str,unsigned int length);
- int main()
- {
- iconv_t cd = iconv_open("GBK","UTF-8");
- if(cd == (iconv_t)(-1))
- {
- cout<<"Failed";
- }
- else
- {
- cout<<"Success"<<endl;
- }
- ifstream fp("1.html"); //1.html为utf8编码格式的文件
- char * inbuf = new char[1000] ;
- string s;
- while(getline(fp,s))
- {
- inbuf = (char *)s.c_str();
- char * in = inbuf;
- char * outbuf = new char[1000];
- char * out = outbuf;
- size_t inlen = 1000;
- size_t outlen = 1000;
- int res = IsTextUTF8(in,s.size());
- if(res)
- {
- cout<<"是utf8格式";
- iconv(cd,&in,&inlen,&out,&outlen);
- }
- else
- {
- cout<<"不是utf8格式";
- outbuf = inbuf;
- }//iconv(cd,&in,&inlen,&out,&outlen);
- cout<<outbuf;
- }
- iconv_close(cd);
- return 0;
- }
/**
 * Checks whether a byte buffer is structurally well-formed multi-byte UTF-8.
 *
 * The check follows the legacy 1-6 byte UTF-8 layout: every lead byte must be
 * followed by the right number of continuation bytes of the form 10xxxxxx.
 * It is a structural check only; it does not reject overlong encodings or
 * particular code point ranges.
 *
 * @param str    Pointer to the bytes to inspect (need not be NUL-terminated).
 * @param length Number of bytes to inspect.
 * @return 1 if the buffer is well-formed and contains at least one multi-byte
 *         sequence; 0 if it is malformed, truncated, empty, null, or consists
 *         only of ASCII bytes (pure ASCII is deliberately reported as "not
 *         UTF-8").
 */
int IsTextUTF8(const char* str,unsigned int length)
{
    if(!str)
    {
        return 0;
    }

    unsigned int pending = 0; // continuation bytes still expected
    int sawMultiByte = 0;     // set once any non-ASCII lead byte is seen

    // Unsigned index: avoids the signed/unsigned comparison against length.
    for(unsigned int i = 0; i < length; i++)
    {
        const unsigned char chr = static_cast<unsigned char>(str[i]);

        if(pending == 0)
        {
            if(chr < 0x80)
            {
                continue; // plain ASCII byte, 0xxxxxxx
            }
            sawMultiByte = 1;

            // Lead byte: its high bits say how long the sequence is.
            if(chr >= 0xFE)
            {
                return 0; // 0xFE and 0xFF never appear in UTF-8
            }
            else if(chr >= 0xFC)
            {
                pending = 5; // 1111110x: 6-byte sequence
            }
            else if(chr >= 0xF8)
            {
                pending = 4; // 111110xx: 5-byte sequence
            }
            else if(chr >= 0xF0)
            {
                pending = 3; // 11110xxx: 4-byte sequence
            }
            else if(chr >= 0xE0)
            {
                pending = 2; // 1110xxxx: 3-byte sequence
            }
            else if(chr >= 0xC0)
            {
                pending = 1; // 110xxxxx: 2-byte sequence
            }
            else
            {
                return 0; // stray continuation byte with no lead byte
            }
        }
        else
        {
            // Inside a sequence: every byte must look like 10xxxxxx.
            if((chr & 0xC0) != 0x80)
            {
                return 0;
            }
            pending--;
        }
    }

    if(pending > 0)
    {
        return 0; // buffer ended in the middle of a sequence
    }
    return sawMultiByte;
}