iconv文件编码判断转换

最新推荐文章于 2023-07-12 11:25:04 发布

whatday

最新推荐文章于 2023-07-12 11:25:04 发布

阅读量3.2k

点赞数

继我的前一篇文章《iconv用法，编码转换（一）》之后，本文补充如何判断一个字符数组中的文本是否为UTF-8编码。如果对非UTF-8编码的文本也调用iconv()进行转换，非UTF-8的文本内容会被删除，因此在调用iconv()函数之前需要先进行字符集判断。方法如下：

使用函数 int IsTextUTF8(const char* str, unsigned int length); 来判断文本是否为UTF-8编码：

[cpp]view plaincopy 
   
 #include<iconv.h>  
 #include<iostream>  
 #include<fstream>  
   
 using namespace std;  
   
 int IsTextUTF8(const char* str,unsigned int  length);  
 int main()  
 {  
     iconv_t cd = iconv_open("GBK","UTF-8");  
     if(cd == (iconv_t)(-1))  
     {  
         cout<<"Failed";  
     }  
     else   
     {  
         cout<<"Success"<<endl;  
     }  
   
     ifstream fp("1.html");  //1.html为utf8编码格式的文件  
     char * inbuf = new char[1000] ;  
     string s;  
     while(getline(fp,s))  
     {  
         inbuf = (char *)s.c_str();  
         char * in = inbuf;  
         char * outbuf = new char[1000];  
         char * out = outbuf;  
         size_t inlen = 1000;  
         size_t outlen = 1000;  
         int res = IsTextUTF8(in,s.size());  
         if(res)   
         {  
             cout<<"是utf8格式";  
             iconv(cd,&in,&inlen,&out,&outlen);  
         }  
         else   
         {  
             cout<<"不是utf8格式";  
             outbuf = inbuf;  
         }//iconv(cd,&in,&inlen,&out,&outlen);  
         cout<<outbuf;  
     }  
     iconv_close(cd);  
     return 0;  
 }  
   
   
 int IsTextUTF8(const char* str,unsigned int  length)  
 {  
     int i;  
     unsigned long nBytes=0;//UFT8可用1-6个字节编码,ASCII用一个字节  
     unsigned char chr;  
     int bAllAscii=1; //如果全部都是ASCII, 说明不是UTF-8  
     for(i=0;i<length;i++)  
     {  
         chr= *(str+i);  
         if( (chr&0x80) != 0 ) // 判断是否ASCII编码,如果不是,说明有可能是UTF-8,ASCII用7位编码,但用一个字节存,最高位标记为0,o0xxxxxxx  
             bAllAscii= 0;  
         if(nBytes==0) //如果不是ASCII码,应该是多字节符,计算字节数  
         {  
             if(chr>=0x80)  
             {  
                 if(chr>=0xFC&&chr<=0xFD)  
                     nBytes=6;  
                 else if(chr>=0xF8)  
                     nBytes=5;  
                 else if(chr>=0xF0)  
                     nBytes=4;  
           
         else if(chr>=0xE0)  
                     nBytes=3;  
                 else if(chr>=0xC0)  
                     nBytes=2;  
                 else  
                 {  
                     return 0;  
                 }  
                 nBytes--;  
             }  
         }  
         else //多字节符的非首字节,应为 10xxxxxx  
         {  
             if( (chr&0xC0) != 0x80 )  
             {  
                 return 0;  
             }  
             nBytes--;  
         }  
     }  
     if( nBytes > 0 ) //违返规则  
     {  
         return 0;  
     }  
     if( bAllAscii ) //如果全部都是ASCII, 说明不是UTF-8  
     {  
         return 0;  
     }  
     return 1;  
 }