UNICODE(UTF-16)与UTF-8编码的相互转换

最新推荐文章于 2023-07-08 16:35:11 发布
天已青色等烟雨来
最新推荐文章于 2023-07-08 16:35:11 发布
阅读量2k
点赞数
分类专栏： C++
C++ 专栏收录该内容
43 篇文章 1 订阅
订阅专栏
我们通常所说的UNICODE（尤其是在Windows平台上）其实指的是UTF-16编码；严格来说，Unicode是字符集，UTF-16和UTF-8都是它的编码方式。下面这几个函数实现UNICODE(UTF-16)与UTF-8编码的相互转换。
[cpp]view plaincopy 
   
 /**
  * This file implements the following functions:
  *
  * 1. UTF-16 character to UTF-8 character conversion.
  * 2. UTF-8 character to UTF-16 character conversion.
  *
  * 3. UTF-16 string to UTF-8 string conversion.
  * 4. UTF-8 string to UTF-16 string conversion.
  */
 /*
  * Size, in bytes, of the scratch buffer used to hold one encoded UTF-8
  * character. The longest sequence UnicodeToUTF8 emits is 6 bytes, so 8
  * leaves spare room after the encoded bytes.
  */
 #define MAX_CHARACTER_SIZE    8
 /**
  * UnicodeToUTF8 - encode a single code point as a UTF-8 byte sequence
  * @unicode: the code point to encode (0 .. 0x7FFFFFFF)
  * @p: destination buffer; must have room for up to 6 bytes
  *
  * Code points below 0x200000 use the 1- to 4-byte forms. Larger values
  * are still accepted and written with the legacy 5- and 6-byte forms, as
  * this function always did. No terminator is appended.
  *
  * @return: One step over the last byte written. When @unicode is negative
  *          nothing is written and @p itself is returned, so callers can
  *          detect the failure with "result > p". Returns NULL when @p is
  *          NULL.
  */
 unsigned char * UnicodeToUTF8( int unicode, unsigned char *p)
 {
     unsigned char *e = p;
     int trail = 0;      /* number of continuation bytes still to emit */

     /* A negative value is not a code point: emit nothing rather than a
        truncated garbage byte. */
     if (!e || unicode < 0)
     {
         return e;
     }

     /* Emit the lead byte and work out how many continuation bytes follow. */
     if (unicode < 0x80)
     {
         /* 0xxxxxxx */
         *e++ = (unsigned char) unicode;
     }
     else if (unicode < 0x800)
     {
         /* 110xxxxx */
         *e++ = (unsigned char) (((unicode >> 6) & 0x1f) | 0xc0);
         trail = 1;
     }
     else if (unicode < 0x10000)
     {
         /* 1110xxxx */
         *e++ = (unsigned char) (((unicode >> 12) & 0x0f) | 0xe0);
         trail = 2;
     }
     else if (unicode < 0x200000)
     {
         /* 11110xxx */
         *e++ = (unsigned char) (((unicode >> 18) & 0x07) | 0xf0);
         trail = 3;
     }
     else if (unicode < 0x4000000)
     {
         /* 111110xx (legacy 5-byte form) */
         *e++ = (unsigned char) (((unicode >> 24) & 0x03) | 0xf8);
         trail = 4;
     }
     else
     {
         /* 1111110x (legacy 6-byte form) */
         *e++ = (unsigned char) (((unicode >> 30) & 0x01) | 0xfc);
         trail = 5;
     }

     /* Continuation bytes, most significant 6-bit group first: 10xxxxxx */
     while (trail > 0)
     {
         trail--;
         *e++ = (unsigned char) (((unicode >> (6 * trail)) & 0x3f) | 0x80);
     }

     /* Return one step over the end of the utf-8 character */
     return e;
 }
 /**
  * UTF8ToUnicode - decode one UTF-8 byte sequence into a code point
  * @ch: buffer holding a utf-8 character (NUL-terminated data is safe:
  *      decoding never reads past a terminator)
  * @unicode: receives the decoded code point on success
  *
  * The sequence length is taken from the lead byte (legacy 5- and 6-byte
  * forms included). Every following byte must be a continuation byte of
  * the form 10xxxxxx; a terminator or any other byte there marks the
  * sequence as malformed. A lead byte in the range 0x80..0xBF is passed
  * through unchanged as a 1-byte character, as this function always did.
  * Overlong encodings are not rejected.
  *
  * @return: Bytes count of the utf-8 character (1 ~ 6), which can be used
  *          to step to the next character of a utf-8 string. Returns 0,
  *          leaving *@unicode untouched, when either pointer is NULL or
  *          the sequence is malformed or truncated.
  */
 int UTF8ToUnicode (unsigned char *ch, int *unicode)
 {
     int value = 0;
     int len = 0;
     int i = 0;

     if (!ch || !unicode)
     {
         return 0;
     }

     /* Classify the lead byte: sequence length plus its payload bits. */
     if (ch[0] >= 0xfc)
     {
         /* 6:<1111110x> */
         len = 6;
         value = ch[0] & 0x01;
     }
     else if (ch[0] >= 0xf8)
     {
         /* 5:<111110xx> */
         len = 5;
         value = ch[0] & 0x03;
     }
     else if (ch[0] >= 0xf0)
     {
         /* 4:<11110xxx> */
         len = 4;
         value = ch[0] & 0x07;
     }
     else if (ch[0] >= 0xe0)
     {
         /* 3:<1110xxxx> */
         len = 3;
         value = ch[0] & 0x0f;
     }
     else if (ch[0] >= 0xc0)
     {
         /* 2:<110xxxxx> */
         len = 2;
         value = ch[0] & 0x1f;
     }
     else
     {
         len = 1;
         value = ch[0];
     }

     /*
      * Fold in the continuation bytes. Checking each one before moving on
      * means a NUL terminator stops the scan, so a truncated sequence can
      * never cause a read beyond the end of the string.
      */
     for (i = 1; i < len; i++)
     {
         if ((ch[i] & 0xc0) != 0x80)
         {
             return 0;
         }
         value = (value << 6) | (ch[i] & 0x3f);
     }

     *unicode = value;

     /* Return bytes count of this utf-8 character */
     return len;
 }
 /** 
  * UnicodeStrToUTF8Str - Convert a utf-16 string to a utf-8 string 
  * @unicde_str: A utf-16 string 
  * @utf8_str: A buffer to contain utf-8 string 
  * @utf8_str_size: Maximum size of the utf-8 string buffer 
  * 
  * @return: One step over the end of the last utf-8 character 
  */  
 unsigned char * UnicodeStrToUTF8Str (unsigned short * unicode_str,  
                             unsigned char * utf8_str, int utf8_str_size)  
 {  
     int unicode = 0;  
     unsigned char *e = NULL, *s = NULL;  
     unsigned char utf8_ch[MAX_CHARACTER_SIZE];   
     s = utf8_str;  
     if ((unicode_str) && (s))  
     {  
         while ((unicode = (int) (*unicode_str++)))  
         {  
             memset (utf8_ch, 0, sizeof (utf8_ch));  
               
             if ((e = UnicodeToUTF8 (unicode, utf8_ch)) > utf8_ch)  
             {  
                 *e = '/0';  
                   
                 /* Judge whether exceed the destination buffer */  
                 if ((s - utf8_str + strlen ((const char *) utf8_ch)) >= utf8_str_size)  
                 {  
                     return s;  
                 }  
                 else  
                 {  
                     memcpy (s, utf8_ch, strlen ((const char *) utf8_ch));  
                     s += strlen ((const char *) utf8_ch);  
                     *s = '/0';  
                 }  
             }  
             else  
             {  
                 /* Converting error occurs */  
                 return s;  
             }  
         }  
     }  
       
     return s;  
 }  
 /**
  * UTF8StrToUnicodeStr - Convert a utf-8 string to a utf-16 string
  * @utf8_str: A NUL-terminated utf-8 string
  * @unicode_str: A buffer to contain the utf-16 string
  * @unicode_str_size: Size of the utf-16 buffer in 16-bit units,
  *                    terminator included
  *
  * Code points above 0xFFFF are written as a surrogate pair (two units).
  * The output is always zero-terminated when @unicode_str_size is
  * positive. Conversion stops early, keeping what was already written,
  * when the next character would not fit together with the terminator,
  * when a sequence is malformed or truncated, or when a decoded value lies
  * outside 0 .. 0x10FFFF and so cannot be represented in utf-16.
  *
  * @return: Number of utf-16 units written, not counting the terminator.
  *          Returns 0 when an argument is NULL or the size is not positive.
  */
 int UTF8StrToUnicodeStr (unsigned char * utf8_str,
                     unsigned short * unicode_str, int unicode_str_size)
 {
     unsigned char *s = utf8_str;
     unsigned short *e = unicode_str;
     int unicode = 0;
     int count = 0;
     int units = 0;
     int n = 0;
     int i = 0;

     if (!utf8_str || !unicode_str || unicode_str_size <= 0)
     {
         return 0;
     }

     /* Guarantee a valid (empty) string even if nothing gets converted. */
     *e = 0;

     while (*s)
     {
         n = UTF8ToUnicode (s, &unicode);
         if (n <= 0)
         {
             /* Converting error occurs */
             return count;
         }

         /*
          * Never step over the terminator: if the reported length reaches
          * past the end of the string the sequence is truncated. The scan
          * stops at the first NUL, so it stays inside the string.
          */
         for (i = 1; i < n; i++)
         {
             if (s[i] == 0)
             {
                 return count;
             }
         }

         if (unicode < 0 || unicode > 0x10ffff)
         {
             /* Not representable in utf-16 */
             return count;
         }

         units = (unicode > 0xffff) ? 2 : 1;

         /* Keep one unit free for the terminator */
         if ((count + units) >= unicode_str_size)
         {
             return count;
         }

         if (units == 2)
         {
             /* Split into high/low surrogate, 10 payload bits each */
             unicode -= 0x10000;
             *e++ = (unsigned short) (0xd800 + (unicode >> 10));
             *e++ = (unsigned short) (0xdc00 + (unicode & 0x3ff));
         }
         else
         {
             *e++ = (unsigned short) unicode;
         }
         *e = 0;
         count += units;

         /* Step to next utf-8 character */
         s += n;
     }

     return count;
 }
天已青色等烟雨来
关注
0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
UNICODE(UTF-16)与UTF-8编码的相互转换

我们通常所说的UNICODE其实是UTF-16，下面这几个函数实现UNICODE(UTF-16)与UTF-8编码的相互转换。[cpp] view plaincopy/** * This file implement functions of: * * 1. UTF-16 character to UTF-8 chaaract
复制链接

扫一扫