UCS-2和UTF-8转换函数

最新推荐文章于 2023-12-22 11:25:36 发布

H-G-Y

最新推荐文章于 2023-12-22 11:25:36 发布

阅读量4.7k

点赞数 2

文章标签： character byte

UCS-2使用固定的两个字节编码每个字符；UTF-8则是一种兼容ASCII的变长编码，其中位于基本多文种平面（U+0000–U+FFFF）内的汉字使用三个字节编码。UCS-2同UTF-8的对应关系如下：

U-00000000 – U-0000007F: 0xxxxxxx

U-00000080 – U-000007FF: 110xxxxx 10xxxxxx

U-00000800 – U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx

U-00010000 – U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

U-00200000 – U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

U-04000000 – U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

因为UCS-2只能表示U+0000–U+FFFF范围内的字符（对应UTF-8的1～3字节编码，汉字即落在3字节这一档），所以不考虑U-0000FFFF以后的字符。从上面的关系可以看出，实现二者的转换是很容易的。下面的代码基本拷贝自 http://www.wangchao.net.cn/bbsdetail_34579.html ，稍有修改：原文的函数只转换单个字符，而我想要转换一个字符串。所有代码如下：

#include <iostream>
#include <string>

using namespace std;

// Basic type aliases used by the conversion routines in this file.
// One UCS-2 code unit. NOTE(review): assumes unsigned short is exactly
// 16 bits wide on the target platform - confirm.
typedef unsigned short UINT16;
// One UTF-8 byte.
typedef unsigned char UINT8;
// Boolean result type; carries either TRUE or FALSE.
typedef unsigned char BOOL;

// Boolean constants, already cast to BOOL.
#define TRUE (BOOL)(1)
#define FALSE (BOOL)(0)

/**
 * Converts a NUL-terminated UTF-8 string into a NUL-terminated UCS-2 string.
 *
 * Only 1-, 2- and 3-byte UTF-8 sequences are decoded, because they cover the
 * whole range U+0000..U+FFFF that a single UCS-2 code unit can hold:
 *
 *   0xxxxxxx                     -> U+0000..U+007F
 *   110xxxxx 10xxxxxx            -> U+0080..U+07FF
 *   1110xxxx 10xxxxxx 10xxxxxx   -> U+0800..U+FFFF
 *
 * Overlong forms (a 2-byte sequence encoding a value below 0x80, or a 3-byte
 * sequence encoding a value below 0x800) are rejected. Values in the
 * surrogate range are still accepted so that everything UCS2_to_UTF8 can
 * produce decodes back to the same code units.
 *
 * @param utf8_code  NUL-terminated UTF-8 input; may be NULL.
 * @param ucs2_code  Output buffer; may be NULL. It must have room for one
 *                   code unit per decoded character plus one terminating 0.
 * @return TRUE when the whole input (including an empty string) was
 *         converted. FALSE when either pointer is NULL or an unrecognized
 *         byte sequence was met; in the latter case the code units decoded
 *         so far are followed by the placeholder 0x22E0 and a terminating 0.
 */
BOOL UTF8_to_UCS2(const UINT8* utf8_code, UINT16* ucs2_code)
{
 if(!utf8_code || !ucs2_code)
 {
  return FALSE;
 }

 const UINT8* in = utf8_code;
 UINT16* out = ucs2_code;
 BOOL converted = TRUE;

 while(*in != 0)
 {
  const UINT8 lead = *in;
  UINT16 code = 0;
  int length = 0; /* stays 0 when the sequence is not recognized */

  if(0x00 == (lead & 0x80))
  {
   /* 1 byte: high bit clear, plain ASCII, copied as is. */
   code = lead;
   length = 1;
  }
  else if(0xc0 == (lead & 0xe0) && 0x80 == (in[1] & 0xc0))
  {
   /* 2 bytes: 5 payload bits from the lead byte, 6 from the trail byte.
      in[1] is safe to read: at worst it is the terminating NUL, which
      fails the continuation-byte test. */
   code = static_cast<UINT16>(((lead & 0x1f) << 6) | (in[1] & 0x3f));

   /* Values below 0x80 must use the 1-byte form. */
   length = (code >= 0x0080) ? 2 : 0;
  }
  else if(0xe0 == (lead & 0xf0) &&
   0x80 == (in[1] & 0xc0) &&
   0x80 == (in[2] & 0xc0))
  {
   /* 3 bytes (Chinese characters take this branch): 4 + 6 + 6 payload
      bits. in[2] is only read after in[1] proved to be a non-NUL
      continuation byte, so the read never passes the terminator. */
   code = static_cast<UINT16>(((lead & 0x0f) << 12) |
    ((in[1] & 0x3f) << 6) |
    (in[2] & 0x3f));

   /* Values below 0x800 must use a shorter form. */
   length = (code >= 0x0800) ? 3 : 0;
  }

  if(0 == length)
  {
   /* Unrecognized byte: leave a placeholder and stop converting. */
   *out++ = 0x22e0;
   converted = FALSE;
   break;
  }

  *out++ = code;
  in += length;
 }

 /* Terminate the output so it can be consumed as a NUL-terminated UCS-2
    string without the caller having to pre-zero the buffer. */
 *out = 0;

 return converted;
}

/**
 * Converts a NUL-terminated UCS-2 string into a NUL-terminated UTF-8 string.
 *
 * Each code unit is encoded independently:
 *
 *   below 0x0080        -> 1 byte   0xxxxxxx
 *   0x0080..0x07FF      -> 2 bytes  110xxxxx 10xxxxxx
 *   0x0800 and above    -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
 *
 * @param ucs2_code  NUL-terminated UCS-2 input; may be NULL. It is only read.
 * @param utf8_code  Output buffer; may be NULL. It must have room for up to
 *                   three bytes per input code unit plus one terminating 0.
 * @return FALSE when either pointer is NULL, TRUE otherwise.
 */
BOOL UCS2_to_UTF8(UINT16* ucs2_code, UINT8* utf8_code)
{
 if(!ucs2_code || !utf8_code)
 {
  return FALSE;
 }

 UINT8* out = utf8_code;

 for(const UINT16* in = ucs2_code; *in != 0; ++in)
 {
  const UINT16 code = *in;

  if(code < 0x0080)
  {
   /* 1 byte UTF-8 character. */
   *out++ = static_cast<UINT8>(code);
  }
  else if(code < 0x0800)
  {
   /* 2 byte UTF-8 character: top 5 bits, then low 6 bits. */
   *out++ = static_cast<UINT8>(0xc0 | (code >> 6));
   *out++ = static_cast<UINT8>(0x80 | (code & 0x003f));
  }
  else
  {
   /* 3 byte UTF-8 character: top 4 bits, middle 6 bits, low 6 bits. */
   *out++ = static_cast<UINT8>(0xe0 | (code >> 12));
   *out++ = static_cast<UINT8>(0x80 | ((code >> 6) & 0x003f));
   *out++ = static_cast<UINT8>(0x80 | (code & 0x003f));
  }
 }

 /* Terminate the output so it can be consumed as a NUL-terminated UTF-8
    string without the caller having to pre-zero the buffer. */
 *out = 0;

 return TRUE;
}

int main()
{
 wstring wstr = L"中国abcd";

 unsigned char utf_buf[1024];
 unsigned char ucs_buf[1024];

 memset(utf_buf, 0, 1024);
 memset(ucs_buf, 0, 1024);

 UCS2_to_UTF8((UINT16*)wstr.c_str(), utf_buf);

 UTF8_to_UCS2(utf_buf, (UINT16*)ucs_buf);

 return 0;
}

"中国abcd"对应的UCS-2编码（按小端字节序逐字节列出）为"2D 4E FD 56 61 00 62 00 63 00 64 00"，对应的UTF-8编码为"E4 B8 AD E5 9B BD 61 62 63 64"。