ASCII码:是指单字节字符集;
Unicode:本文特指以宽字符(wchar_t / unsigned long)保存的Unicode码值, 下文代码按Intel平台的Little Endian(低地址存低位)字节序存储;
UTF-8:一种变长的编码字符集。它可以使用1~6个字节表示一个符号, 根据不同的符号而变化字节长度。
UTF-8的编码规则如下:
1) 对于单字节的符号, 字节的第一位设为0, 后面7位为这个符号的unicode码. 因此对于英语字母, UTF-8编码和ASCII码是相同的。
2) 对于n字节的符号(n>1), 第一个字节的前n位都设为1, 第n+1位设为0, 后面字节的前两位一律设为10. 剩下的没有提及的二进制位, 全部为这个符号的unicode码。
下表总结了编码规则, 字母x表示可用编码的位
| Unicode符号范围 | UTF-8编码方式
n | (十六进制) | (二进制)
---+-----------------------+------------------------------------------------------
1 | 0000 0000 - 0000 007F | 0xxxxxxx
2 | 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
3 | 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
4 | 0001 0000 - 001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5 | 0020 0000 - 03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6 | 0400 0000 - 7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1. ASCII与UTF-8之间的转换
// Convert an ANSI (multi-byte) string to UTF-8
/*****************************************************************************
 * Converts a NUL-terminated ANSI string into a UTF-8 byte stream.
 *
 * Parameters:
 *   pAscii  input buffer (NUL-terminated). Bytes below 0x80 are copied
 *           unchanged; any other byte is treated as the first byte of a
 *           two-byte character and converted through ANSIToUnicode() and
 *           unicode_to_utf8_one().
 *   pDes    output buffer receiving the UTF-8 bytes plus a terminating NUL.
 *           May be NULL, in which case nothing is written and only the
 *           required byte count is computed.
 *
 * Returns:
 *   Number of UTF-8 bytes produced (terminator not included); 0 when
 *   pAscii is NULL or the input is empty.
 *
 * Notes:
 *   - The caller must size pDes generously; this routine does not know the
 *     capacity of pDes.
 *   - NOTE(review): every byte >= 0x80 is assumed to start a two-byte
 *     character; single-byte code pages are not handled - confirm against
 *     the code pages actually in use.
 *   - NOTE(review): ANSIToUnicode() is not defined in the visible source;
 *     presumably it has the same contract as ASCIIToUnicode() - confirm.
 ****************************************************************************/
int AsciiToUTF8(unsigned char* pAscii, unsigned char* pDes)
{
    int nIndex = 0;
    int nCount = 0;
    unsigned char* pSource = pAscii;
    unsigned char* p = pDes;
    unsigned char szTemp[10];

    if (pAscii == NULL)
    {
        if (p) *p = 0x00;
        return 0;
    }

    while (*pSource != 0x00)
    {
        /* 0x00..0x7F (including DEL, 0x7F) are single-byte characters. */
        if ((*pSource) < 0x80)
        {
            if (p)
            {
                *p = *pSource;
                p++;
            }
            nCount++;
            pSource++;
            continue;
        }

        /* A lead byte at the very end of the string has no trail byte:
         * consume only what exists so the cursor never steps over the NUL. */
        int nSrcBytes = (pSource[1] != 0x00) ? 2 : 1;

        /* Room for two converted characters plus the terminator, so the
         * helper cannot write past the buffer. */
        wchar_t wide[4] = {0};

        memcpy(szTemp, pSource, nSrcBytes);
        szTemp[nSrcBytes] = 0x00;
        ANSIToUnicode((char*)szTemp, wide);

        nIndex = unicode_to_utf8_one((unsigned long)wide[0], (unsigned char*)szTemp, 10);
        if (p)
        {
            memcpy(p, szTemp, nIndex);
            p += nIndex;
        }
        nCount += nIndex;
        pSource += nSrcBytes;
    }

    if (p) *p = 0x00;
    return nCount;
}
// Convert a UTF-8 string to an ANSI (multi-byte) string
/*****************************************************************************
 * Converts a NUL-terminated UTF-8 byte stream into an ANSI string.
 *
 * Parameters:
 *   pUtf8   input buffer, UTF-8 encoded, NUL-terminated.
 *   pAscii  output buffer receiving the ANSI bytes plus a terminating NUL.
 *           May be NULL, in which case only the byte count is computed.
 *
 * Returns:
 *   Number of output bytes (terminator not included). Every non-ASCII
 *   character is counted as two bytes. Returns 0 when pUtf8 is NULL, is
 *   empty, or contains a sequence that utf8_to_unicode_one() rejects; in the
 *   rejection case the output written so far is NUL-terminated.
 *
 * Notes:
 *   - NOTE(review): Unicode2Acsi() is not defined in the visible source;
 *     presumably it has the same contract as UnicodeToANSI() - confirm.
 *   - NOTE(review): the output cursor always advances by two bytes per
 *     non-ASCII character, which assumes a double-byte target code page.
 ****************************************************************************/
int UTF8ToAscii(unsigned char* pUtf8, unsigned char* pAscii)
{
    int nIndex = 0;
    int nCount = 0;
    unsigned char* pSource = pUtf8;
    unsigned char* p = pAscii;
    unsigned long w = 0;

    if (pUtf8 == NULL)
    {
        if (p) *p = 0x00;
        return 0;
    }

    while (*pSource != 0x00)
    {
        /* 0x00..0x7F (including DEL, 0x7F) are copied through unchanged. */
        if ((*pSource) < 0x80)
        {
            if (p)
            {
                *p = *pSource;
                p++;
            }
            nCount++;
            pSource++;
            continue;
        }

        nIndex = utf8_to_unicode_one(pSource, &w);
        if (nIndex <= 0)
        {
            /* The decoder consumed nothing; bail out instead of looping
             * forever on the same byte. */
            if (p) *p = 0x00;
            return 0;
        }
        pSource += nIndex;

        if (p)
        {
            /* Hand the helper a properly terminated one-character string. */
            wchar_t wide[2];
            wide[0] = (wchar_t)w;
            wide[1] = 0;
            Unicode2Acsi(wide, (char*)p);
            p += 2;
        }
        nCount += 2;
    }

    if (p) *p = 0x00;
    return nCount;
}
2. ASCII与Unicode之间的转换
// Convert an ANSI string to a wide-character (Unicode) string
/*****************************************************************************
 * Converts a NUL-terminated ANSI string (CP_ACP) to wide characters using
 * MultiByteToWideChar.
 *
 * Parameters:
 *   pANSI     input buffer, NUL-terminated.
 *   pUnicode  output buffer for the wide-character string. Pass NULL to
 *             query the required size without converting.
 *
 * Returns:
 *   The value reported by MultiByteToWideChar: the number of wide
 *   characters, including the terminating NUL because the input length is
 *   passed as -1. Returns 0 on failure or when pANSI is NULL.
 *
 * Notes:
 *   The caller must make sure pUnicode can hold the count returned by a
 *   prior size query.
 ****************************************************************************/
int WINAPI ASCIIToUnicode(char* pANSI, WCHAR *pUnicode)
{
    if (pANSI == NULL)
        return 0;

    // First pass: ask the API how many wide characters are needed.
    int unicodeLen = ::MultiByteToWideChar(CP_ACP, 0, pANSI, -1, NULL, 0);

    // Size query (or failure): report the length without a second call,
    // which would fail when given a NULL buffer with a non-zero size.
    if (unicodeLen <= 0 || pUnicode == NULL)
        return unicodeLen;

    // Second pass: perform the actual conversion.
    return ::MultiByteToWideChar(CP_ACP, 0, pANSI, -1, (LPWSTR)pUnicode, unicodeLen);
}
// Convert a wide-character (Unicode) string to an ANSI string
/*****************************************************************************
 * Converts a NUL-terminated wide-character string to ANSI (CP_ACP) using
 * WideCharToMultiByte.
 *
 * Parameters:
 *   pUnicode  input buffer, NUL-terminated wide-character string.
 *   pANSI     output buffer for the ANSI string. Pass NULL to query the
 *             required size without converting.
 *
 * Returns:
 *   The value reported by WideCharToMultiByte: the number of bytes,
 *   including the terminating NUL because the input length is passed as -1.
 *   Returns 0 on failure or when pUnicode is NULL.
 *
 * Notes:
 *   The caller must make sure pANSI can hold the count returned by a prior
 *   size query.
 ****************************************************************************/
int WINAPI UnicodeToANSI(WCHAR *pUnicode, char* pANSI)
{
    if (pUnicode == NULL)
        return 0;

    // First pass: ask the API how many bytes are needed.
    int iTextLen = ::WideCharToMultiByte(CP_ACP, 0, pUnicode, -1,
                                         NULL, 0, NULL, NULL);

    // Size query (or failure): report the length without a second call,
    // which would fail when given a NULL buffer with a non-zero size.
    if (iTextLen <= 0 || pANSI == NULL)
        return iTextLen;

    // Second pass: perform the actual conversion.
    return ::WideCharToMultiByte(CP_ACP, 0, pUnicode, -1,
                                 pANSI, iTextLen, NULL, NULL);
}
3. UTF-8与Unicode之间的转换
/*****************************************************************************
 * Decodes a single UTF-8 encoded character into its Unicode code value.
 *
 * Parameters:
 *   pInput  input buffer holding the UTF-8 sequence (1 to 6 bytes).
 *   Unic    receives the decoded code value as an unsigned long.
 *
 * Returns:
 *   The number of input bytes consumed (1..6), or 0 on failure. Failure
 *   means a NULL argument or a continuation byte that is not of the form
 *   10xxxxxx; *Unic is then 0 (when Unic is not NULL).
 *
 * Notes:
 *   1. The value is assembled arithmetically, so the result does not depend
 *      on the byte order of the host.
 *   2. A byte that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF) is
 *      passed through as-is: *Unic receives the byte value and 1 is
 *      returned, so callers always advance by exactly that byte.
 *   3. Continuation bytes are validated one at a time, so nothing beyond
 *      the first invalid byte (e.g. a NUL terminator) is read.
 *   4. Overlong forms and surrogate values are not rejected.
 ****************************************************************************/
int utf8_to_unicode_one(const unsigned char* pInput, unsigned long *Unic)
{
    unsigned long codeValue;
    unsigned char lead;
    int utfbytes;
    int i;

    if (pInput == NULL || Unic == NULL)
        return 0;

    *Unic = 0x00;   /* defined result on every failure path */
    lead = pInput[0];

    /* Classify the lead byte: sequence length and its payload bits. */
    if (lead < 0x80)
    {
        utfbytes = 1;
        codeValue = lead;
    }
    else if ((lead & 0xE0) == 0xC0)
    {
        utfbytes = 2;
        codeValue = lead & 0x1F;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        utfbytes = 3;
        codeValue = lead & 0x0F;
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        utfbytes = 4;
        codeValue = lead & 0x07;
    }
    else if ((lead & 0xFC) == 0xF8)
    {
        utfbytes = 5;
        codeValue = lead & 0x03;
    }
    else if ((lead & 0xFE) == 0xFC)
    {
        utfbytes = 6;
        codeValue = lead & 0x01;
    }
    else
    {
        /* Stray continuation byte or 0xFE/0xFF: pass the byte through. */
        *Unic = lead;
        return 1;
    }

    /* Each continuation byte must be 10xxxxxx and contributes 6 bits. */
    for (i = 1; i < utfbytes; i++)
    {
        unsigned char next = pInput[i];
        if ((next & 0xC0) != 0x80)
            return 0;
        codeValue = (codeValue << 6) | (unsigned long)(next & 0x3F);
    }

    *Unic = codeValue;
    return utfbytes;
}
// Encode one Unicode code value as UTF-8.
/*****************************************************************************
 * Encodes a single Unicode code value as a UTF-8 byte sequence.
 *
 * Parameters:
 *   unic     the code value to encode (0 .. 0x7FFFFFFF)
 *   pOutput  buffer that receives the UTF-8 bytes (no terminator is written)
 *   outSize  capacity of pOutput in bytes
 *
 * Returns:
 *   The number of bytes written (1..6), or 0 on error: pOutput is NULL,
 *   outSize is smaller than 6, or unic is above 0x7FFFFFFF.
 *
 * Notes:
 *   1. pOutput must always offer at least 6 bytes, even when the encoded
 *      form turns out shorter.
 *   2. Layout by range:
 *        U-00000000 - U-0000007F: 0xxxxxxx
 *        U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
 *        U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 *        U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *        U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx (x4)
 *        U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx (x5)
 ****************************************************************************/
int unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize)
{
    /* Marker bits of the lead byte, indexed by total sequence length. */
    static const unsigned char leadMarker[7] =
        { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

    unsigned long remaining = unic;
    int seqLen;
    int pos;

    if (pOutput == NULL || outSize < 6)
        return 0;

    /* Choose the sequence length from the magnitude of the code value. */
    if (unic <= 0x0000007FUL)
        seqLen = 1;
    else if (unic <= 0x000007FFUL)
        seqLen = 2;
    else if (unic <= 0x0000FFFFUL)
        seqLen = 3;
    else if (unic <= 0x001FFFFFUL)
        seqLen = 4;
    else if (unic <= 0x03FFFFFFUL)
        seqLen = 5;
    else if (unic <= 0x7FFFFFFFUL)
        seqLen = 6;
    else
        return 0;

    /* Fill continuation bytes back to front, six payload bits apiece. */
    for (pos = seqLen - 1; pos > 0; --pos)
    {
        pOutput[pos] = (unsigned char)(0x80 | (remaining & 0x3F));
        remaining >>= 6;
    }

    /* Whatever bits are left fit beside the lead-byte marker. */
    pOutput[0] = (unsigned char)(leadMarker[seqLen] | remaining);
    return seqLen;
}
// Convert a UTF-8 string to a wide-character (Unicode) string
/*****************************************************************************
 * Converts a NUL-terminated UTF-8 byte stream into wide characters, one
 * output element per decoded character.
 *
 * Parameters:
 *   pUtf8   input buffer, UTF-8 encoded, NUL-terminated.
 *   pWchar  output buffer receiving the wide characters plus a terminating
 *           0. May be NULL, in which case only the count is computed.
 *
 * Returns:
 *   Number of characters decoded (terminator not included). Returns 0 when
 *   pUtf8 is NULL, is empty, or contains a sequence that
 *   utf8_to_unicode_one() rejects; in the rejection case the output written
 *   so far is terminated.
 *
 * Notes:
 *   Each decoded value is truncated to 16 bits before being stored, so
 *   values above 0xFFFF are not represented faithfully.
 ****************************************************************************/
int WINAPI UTF8ToUnicode(unsigned char* pUtf8, wchar_t* pWchar)
{
    int nIndex = 0;
    int nCount = 0;
    unsigned char* p = pUtf8;
    wchar_t* pDes = pWchar;
    unsigned long nWchar = 0;

    if (pUtf8 == NULL)
    {
        if (pWchar) *pWchar = 0x00;
        return 0;
    }

    while (*p != 0x00)
    {
        nIndex = utf8_to_unicode_one(p, &nWchar);
        if (nIndex <= 0)
        {
            /* The decoder consumed nothing; stop here rather than spin on
             * the same byte forever. */
            if (pWchar) *pDes = 0x00;
            return 0;
        }
        p += nIndex;

        if (pWchar)
        {
            *pDes = (unsigned short)nWchar;
            pDes++;
        }
        nCount++;
    }

    if (pWchar) *pDes = 0x00;
    return nCount;
}
// Convert a wide-character (Unicode) string to UTF-8
/*****************************************************************************
 * Converts a 0-terminated wide-character string into a UTF-8 byte stream.
 *
 * Parameters:
 *   pWchar  input buffer, 0-terminated wide-character string.
 *   pDes    output buffer receiving the UTF-8 bytes plus a terminating NUL.
 *           May be NULL, in which case nothing is written and only the
 *           required byte count is computed.
 *
 * Returns:
 *   Number of UTF-8 bytes produced (terminator not included); 0 when
 *   pWchar is NULL or empty.
 *
 * Notes:
 *   Each input element is truncated to 16 bits and encoded on its own;
 *   surrogate pairs are not combined.
 ****************************************************************************/
int WINAPI UnicodeToUTF8(wchar_t* pWchar, char* pDes)
{
    int nIndex = 0;
    unsigned char scratch[6];   /* sink used when only counting */
    wchar_t* p = pWchar;

    if (pWchar == NULL)
    {
        if (pDes) pDes[0] = 0x00;
        return 0;
    }

    while (*p != 0x00)
    {
        /* In query mode encode into the scratch buffer so the byte count is
         * still produced without touching the NULL destination. */
        unsigned char* pTarget = pDes ? ((unsigned char*)pDes + nIndex) : scratch;
        nIndex += unicode_to_utf8_one((unsigned short)*p, pTarget, 6);
        p++;
    }

    if (pDes) pDes[nIndex] = 0x00;
    return nIndex;
}
感谢“TensorFlow”的分享:https://blog.csdn.net/chary8088/article/details/21226375