UTF-8 转换为 Unicode 的 C++ 代码:
/*************************************************************************************
Unicode <- UTF-8
(U-0000 ~ U-007F) 00000000 0xxxxxxx <- 0xxxxxxx
(U-0080 ~ U-07FF) 00000xxx xxxxxxxx <- 110xxxxx 10xxxxxx
(U-0800 ~ U-FFFF) xxxxxxxx xxxxxxxx <- 1110xxxx 10xxxxxx 10xxxxxx
*************************************************************************************/
/*
 * Report the byte order of the host.
 *
 * Stores the value 1 in a 16-bit-or-wider integer and inspects the byte at
 * its lowest address: on a little-endian machine that byte holds the 1.
 *
 * Returns 1 when the host is little-endian, 0 otherwise.
 */
int IsLittleEndian()
{
    const unsigned short wProbe = 0x0001;
    const unsigned char *pbyFirst = (const unsigned char *)&wProbe;

    if (pbyFirst[0] == 1)
    {
        return 1;
    }
    return 0;
}
/*
 * Convert a NUL-terminated UTF-8 string into 16-bit Unicode code units
 * (two bytes per character), stored in the host's byte order.
 *
 * Only 1-, 2- and 3-byte UTF-8 sequences are accepted, i.e. code points
 * U+0001..U+FFFF. The call fails on a 4-byte (or longer) lead byte, on a
 * stray continuation byte, and on a multi-byte sequence whose trailing bytes
 * are missing or are not of the form 10xxxxxx. Overlong encodings and
 * surrogate code points are NOT rejected.
 *
 * Parameters:
 *   pchStrIn    - UTF-8 input; NULL makes the call fail.
 *   pchStrOut   - destination buffer; receives dwOutLen bytes. No terminator
 *                 is appended.
 *   dwOutBufLen - capacity of pchStrOut in bytes.
 *   dwOutLen    - out: number of bytes produced. If the input decoded cleanly
 *                 but pchStrOut is NULL or too small, it holds the number of
 *                 bytes required. It is 0 on every other failure.
 *
 * Returns 0 on success, -1 on failure. pchStrOut is never written on failure:
 * decoding goes into a scratch buffer that is copied out only at the end.
 */
int Utf82Unicode( const char *pchStrIn, char *pchStrOut, unsigned int dwOutBufLen, unsigned int &dwOutLen )
{
    dwOutLen = 0;
    if (NULL == pchStrIn)
    {
        return -1;
    }

    /* Work on unsigned bytes so the bit tests do not depend on whether plain
       char is signed on this platform. */
    const unsigned char *pbySrc = (const unsigned char *)pchStrIn;

    /* Every input byte yields at most two output bytes. Refuse inputs whose
       worst-case output would not fit in the unsigned int length reported to
       the caller; this also keeps the allocation size from wrapping. */
    size_t cbIn = strlen(pchStrIn);
    if (cbIn > ((unsigned int)-1) / 2)
    {
        return -1;
    }

    /* +1 so that an empty input never asks for a zero-byte allocation. */
    unsigned char *pbyTmpBuf = (unsigned char *)malloc(cbIn * 2 + 1);
    if (NULL == pbyTmpBuf)
    {
        return -1;
    }

    unsigned char *pbyDst = pbyTmpBuf;
    int bLittleEndian = IsLittleEndian();

    while (*pbySrc != '\0')
    {
        unsigned char byFst = pbySrc[0];
        unsigned char byHigh = 0;
        unsigned char byLow = 0;

        if (byFst <= 0x7F) /* 1-byte sequence: 0xxxxxxx (ASCII) */
        {
            byLow = byFst;
            pbySrc += 1;
        }
        else if ((byFst & 0xE0) == 0xC0) /* 2-byte sequence: 110xxxxx 10xxxxxx */
        {
            /* A NUL terminator is not a continuation byte, so this check
               also stops us from stepping past the end of the string. */
            if ((pbySrc[1] & 0xC0) != 0x80)
            {
                dwOutLen = 0;
                free(pbyTmpBuf);
                return -1;
            }
            unsigned char bySec = pbySrc[1];
            byLow = (unsigned char)((bySec & 0x3F) | (byFst << 6));
            byHigh = (unsigned char)((byFst >> 2) & 0x07);
            pbySrc += 2;
        }
        else if ((byFst & 0xF0) == 0xE0) /* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
        {
            /* Short-circuit evaluation: the third byte is only read once the
               second one is known to be a (non-NUL) continuation byte. */
            if ((pbySrc[1] & 0xC0) != 0x80 || (pbySrc[2] & 0xC0) != 0x80)
            {
                dwOutLen = 0;
                free(pbyTmpBuf);
                return -1;
            }
            unsigned char bySec = pbySrc[1];
            unsigned char byTrd = pbySrc[2];
            byLow = (unsigned char)((byTrd & 0x3F) | (bySec << 6));
            byHigh = (unsigned char)((byFst << 4) | ((bySec >> 2) & 0x0F));
            pbySrc += 3;
        }
        else /* stray continuation byte or a sequence longer than 3 bytes: unsupported */
        {
            dwOutLen = 0;
            free(pbyTmpBuf);
            return -1;
        }

        /* Emit the 16-bit code unit in host byte order. */
        if (bLittleEndian)
        {
            pbyDst[0] = byLow;
            pbyDst[1] = byHigh;
        }
        else
        {
            pbyDst[0] = byHigh;
            pbyDst[1] = byLow;
        }
        pbyDst += 2;
        dwOutLen += 2;
    }

    if (NULL == pchStrOut || dwOutLen > dwOutBufLen)
    {
        /* Caller's buffer is missing or too small; dwOutLen keeps the
           required size so the caller can retry. */
        free(pbyTmpBuf);
        return -1;
    }

    memcpy(pchStrOut, pbyTmpBuf, dwOutLen);
    free(pbyTmpBuf);
    return 0;
}