UCS-2使用两个字节编码,UTF-8则是一种变长编码,其兼容ASCII,常用汉字使用三个字节编码。Unicode码点(UCS)同UTF-8的对应关系如下(UCS-2只能表示到U-0000FFFF,即只对应下表的前三行):
U-00000000 – U-0000007F:
0xxxxxxx
U-00000080 – U-000007FF:
110xxxxx 10xxxxxx
U-00000800 – U-0000FFFF:
1110xxxx 10xxxxxx 10xxxxxx
U-00010000 – U-001FFFFF:
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 – U-03FFFFFF:
111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 – U-7FFFFFFF:
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
因为UCS-2只用两个字节表示一个字符,最多只能表示到U-0000FFFF(常用汉字都在这个范围内,对应三个字节的UTF-8编码),所以不考虑U-0000FFFF以后的字符。从上面的关系可以看出实现二者的转换是很容易的,下面的代码基本拷贝自:
http://www.wangchao.net.cn/bbsdetail_34579.html,稍有修改。原文的函数只转换单个字符,我想要转换一个字符串。所有代码如下:
#include <iostream>
#include <string>
using namespace std;
/* One UCS-2 code unit; the conversion code treats it as a 16-bit value. */
typedef unsigned short UINT16;
/* One byte of UTF-8 data. */
typedef unsigned char UINT8;
/* Boolean result type returned by the conversion functions. */
typedef unsigned char BOOL;
/* Boolean constants, explicitly cast to BOOL. */
#define TRUE (BOOL)(1)
#define FALSE (BOOL)(0)
/*
 * Convert a NUL-terminated UTF-8 string into a zero-terminated UCS-2 string.
 *
 * Only 1-, 2- and 3-byte UTF-8 sequences (U+0000..U+FFFF) are handled,
 * because UCS-2 cannot represent anything above U+FFFF.
 *
 * utf8_code  Input bytes, terminated by a 0 byte.
 * ucs2_code  Output buffer. No size is passed in, so the caller must provide
 *            room for one UINT16 per input byte plus one terminating 0.
 *
 * Returns TRUE when the whole input was converted (an empty input converts
 * to an empty output). Returns FALSE when either pointer is NULL, or when an
 * invalid, overlong or longer-than-3-byte sequence is met; in that case the
 * characters decoded so far are kept, followed by the marker value 0x22e0
 * and the terminator.
 */
BOOL UTF8_to_UCS2(const UINT8* utf8_code, UINT16* ucs2_code)
{
    const UINT8* in = utf8_code;
    BOOL ok = TRUE;

    if (!utf8_code || !ucs2_code)
    {
        return FALSE;
    }

    while (*in != 0)
    {
        UINT16 code = 0;
        int length = 0; /* number of input bytes consumed; 0 means "invalid" */

        if (0x00 == (*in & 0x80))
        {
            /* 1 byte: 0xxxxxxx -- plain ASCII, copied as is. */
            code = *in;
            length = 1;
        }
        else if (0xc0 == (*in & 0xe0) && 0x80 == (in[1] & 0xc0))
        {
            /* 2 bytes: 110xxxxx 10xxxxxx -- 5 + 6 payload bits. */
            code = (UINT16)(((*in & 0x1f) << 6) | (in[1] & 0x3f));
            /* Values below 0x80 must use the 1-byte form (overlong otherwise). */
            if (code >= 0x0080)
            {
                length = 2;
            }
        }
        else if (0xe0 == (*in & 0xf0) &&
                 0x80 == (in[1] & 0xc0) &&
                 0x80 == (in[2] & 0xc0))
        {
            /*
             * 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx -- 4 + 6 + 6 payload bits.
             * Chinese characters take this branch. The && chain stops at the
             * first failing test, so a truncated sequence never reads past
             * the terminating 0 byte.
             */
            code = (UINT16)(((*in & 0x0f) << 12) |
                            ((in[1] & 0x3f) << 6) |
                            (in[2] & 0x3f));
            /* Values below 0x800 must use a shorter form (overlong otherwise). */
            if (code >= 0x0800)
            {
                length = 3;
            }
        }

        if (0 == length)
        {
            /* Unrecognized byte: leave the marker in the output and stop. */
            *ucs2_code++ = 0x22e0;
            ok = FALSE;
            break;
        }

        *ucs2_code++ = code;
        in += length;
    }

    /* Terminate the output so callers need not pre-zero the buffer. */
    *ucs2_code = 0;
    return ok;
}
/*
 * Convert a zero-terminated UCS-2 string into a NUL-terminated UTF-8 string.
 *
 * ucs2_code  Input code units, terminated by a 0 value.
 * utf8_code  Output buffer. No size is passed in, so the caller must provide
 *            room for up to three bytes per input code unit plus one
 *            terminating 0 byte.
 *
 * Returns FALSE when either pointer is NULL, TRUE otherwise.
 *
 * NOTE(review): values 0xD800-0xDFFF are encoded like any other code unit;
 * confirm that callers pass true UCS-2 and not UTF-16 with surrogate pairs.
 */
BOOL UCS2_to_UTF8(UINT16* ucs2_code, UINT8* utf8_code)
{
    UINT8* out = utf8_code;

    if (!ucs2_code || !utf8_code)
    {
        return FALSE;
    }

    for (; *ucs2_code != 0; ++ucs2_code)
    {
        const UINT16 code = *ucs2_code;

        if (code < 0x0080)
        {
            /* 1 byte: 0xxxxxxx */
            *out++ = (UINT8)code;
        }
        else if (code < 0x0800)
        {
            /* 2 bytes: 110xxxxx 10xxxxxx */
            *out++ = (UINT8)(0xc0 | (code >> 6));
            *out++ = (UINT8)(0x80 | (code & 0x003f));
        }
        else
        {
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            *out++ = (UINT8)(0xe0 | (code >> 12));
            *out++ = (UINT8)(0x80 | ((code >> 6) & 0x003f));
            *out++ = (UINT8)(0x80 | (code & 0x003f));
        }
    }

    /* Terminate the output so callers need not pre-zero the buffer. */
    *out = 0;
    return TRUE;
}
int main()
{
wstring wstr = L"中国abcd";
unsigned char utf_buf[1024];
unsigned char ucs_buf[1024];
memset(utf_buf, 0, 1024);
memset(ucs_buf, 0, 1024);
UCS2_to_UTF8((UINT16*)wstr.c_str(), utf_buf);
UTF8_to_UCS2(utf_buf, (UINT16*)ucs_buf);
return 0;
}
"中国abcd"对应的UCS-2编码为"2D 4E FD 56 61 00 62 00 63 00 64 00",对应的UTF-8编码为"E4 B8 AD E5 9B BD 61 62 63 64"。