前言:
1、在 Windows 下编程时一般使用 Unicode 编码,因此通常会先把获取到的字符串转换成 Unicode 编码,再进行操作;
2、根据 URL 获取到的字符串在解析之后一般是 ANSI 编码,因此同样需要先转换成 Unicode 编码,再进行操作。
一、GBK -> UTF-8
// Converts GB2312/GBK text to UTF-8.
//
// pOut  - receives the converted text; any previous content is discarded.
// pText - input bytes; a null pointer yields an empty result.
// pLen  - number of bytes of pText to convert.
//
// Bytes below 0x80 are copied through (a NUL byte contributes nothing to the
// output). Every other byte is treated as the lead byte of a two-byte
// character, converted with Gb2312ToUnicode and then UnicodeToUTF_8.
// NOTE(review): Gb2312ToUnicode is defined outside this block; it is assumed
// to read exactly two bytes starting at the given pointer - confirm.
void GB2312ToUTF_8(string& pOut, char *pText, int pLen)
{
    pOut.clear();
    if (pText == NULL)
    {
        return;
    }

    // Fourth byte stays zero so the three UTF-8 bytes form a C string.
    char buf[4];
    memset(buf, 0, sizeof(buf));

    int i = 0;
    while (i < pLen)
    {
        const unsigned char current = static_cast<unsigned char>(pText[i]);
        if (current < 0x80)
        {
            // Plain ASCII is copied as-is; compare as unsigned so the test
            // does not depend on whether char is signed.
            if (current != 0)
            {
                pOut.append(1, pText[i]);
            }
            ++i;
        }
        else
        {
            // A lead byte without its trail byte cannot be converted; stop
            // instead of reading past the end of the input.
            if (i + 1 >= pLen)
            {
                break;
            }
            WCHAR pbuffer = 0;
            Gb2312ToUnicode(&pbuffer, pText + i);
            UnicodeToUTF_8(buf, &pbuffer);
            pOut.append(buf);
            // A double-byte character occupies two input bytes (the previous
            // step of three skipped the byte that followed it).
            i += 2;
        }
    }
    return;
}
二、UTF-8 -> GBK
// Converts UTF-8 text to GB2312/GBK.
//
// pOut  - receives the converted text; any previous content is replaced.
// pText - input bytes; a null pointer yields an empty result.
// pLen  - number of bytes of pText to convert.
//
// Bytes below 0x80 are copied through; a NUL byte ends the conversion, so the
// result is always NUL-free text. Every other byte is treated as the start of
// a three-byte UTF-8 sequence, decoded with UTF_8ToUnicode and re-encoded with
// UnicodeToGB2312.
// NOTE(review): two- and four-byte UTF-8 sequences are not recognised here and
// will be misread as three-byte ones.
// NOTE(review): UnicodeToGB2312 is defined outside this block; it is assumed
// to write at most two bytes into the buffer - confirm.
void UTF_8ToGB2312(string &pOut, char *pText, int pLen)
{
    // Built in a local so that pText may point into pOut's own storage.
    string converted;
    if (pText != NULL && pLen > 0)
    {
        // GBK output is never longer than the UTF-8 input.
        converted.reserve(pLen);

        char buf[4];
        int i = 0;
        while (i < pLen)
        {
            const unsigned char current = static_cast<unsigned char>(pText[i]);
            if (current < 0x80)
            {
                if (current == 0)
                {
                    break;
                }
                converted.append(1, pText[i]);
                ++i;
            }
            else
            {
                // An incomplete trailing sequence cannot be decoded; stop
                // instead of reading past the end of the input.
                if (i + 3 > pLen)
                {
                    break;
                }
                WCHAR Wtemp = 0;
                UTF_8ToUnicode(&Wtemp, pText + i);
                // Clear first so stale bytes from the previous character can
                // never leak into this one.
                memset(buf, 0, sizeof(buf));
                UnicodeToGB2312(buf, Wtemp);
                // A GBK character is at most two bytes. Appending as a C
                // string emits only the bytes actually produced, where the
                // old code advanced by three and left a NUL inside the text.
                buf[2] = '\0';
                converted.append(buf);
                i += 3;
            }
        }
    }
    pOut = converted;
}
三、Unicode(wchar)-> UTF-8
1、
// Converts a NUL-terminated wide string to a newly allocated UTF-8 string.
//
// pwStr - NUL-terminated wide string; may be NULL.
//
// Returns a buffer allocated with new[] that holds the NUL-terminated UTF-8
// text; the caller owns it and must release it with delete[]. Returns NULL
// when pwStr is NULL or when either WideCharToMultiByte call fails.
const char* WcharToUtf8(const wchar_t *pwStr)
{
    if (pwStr == NULL)
    {
        return NULL;
    }

    // Sizing pass: with no output buffer the call only reports the required
    // byte count (terminator included, because the input length is -1).
    const int len = WideCharToMultiByte(CP_UTF8, 0, pwStr, -1, NULL, 0, NULL, NULL);
    if (len <= 0)
    {
        return NULL;
    }

    char *pStr = new char[len];
    // The conversion itself can still fail; never hand back an uninitialised
    // buffer in that case.
    if (WideCharToMultiByte(CP_UTF8, 0, pwStr, -1, pStr, len, NULL, NULL) <= 0)
    {
        delete[] pStr;
        return NULL;
    }
    return pStr;
}
2、
// Encodes the 16-bit value at pText as a three-byte UTF-8 sequence.
//
// pOut  - receives exactly three bytes; no terminator is written.
// pText - points to the value to encode.
//
// The three-byte form is produced unconditionally, whatever the value is.
void UnicodeToUTF_8(char* pOut, WCHAR* pText)
{
    // Note the WCHAR byte order: the low byte comes first, the high byte second.
    const unsigned char* raw = reinterpret_cast<const unsigned char*>(pText);
    const unsigned int lowByte = raw[0];
    const unsigned int highByte = raw[1];

    // 1110xxxx : top four bits of the value.
    pOut[0] = static_cast<char>(0xE0 | (highByte >> 4));
    // 10xxxxxx : next six bits (four from the high byte, two from the low).
    pOut[1] = static_cast<char>(0x80 | ((highByte & 0x0F) << 2) | (lowByte >> 6));
    // 10xxxxxx : bottom six bits.
    pOut[2] = static_cast<char>(0x80 | (lowByte & 0x3F));
}
四、UTF-8 -> Unicode(wchar)
1、
// Converts a NUL-terminated UTF-8 string to a newly allocated wide string.
//
// pStr - NUL-terminated UTF-8 text; may be NULL.
//
// Returns a buffer allocated with new[] that holds the NUL-terminated wide
// text; the caller owns it and must release it with delete[]. Returns NULL
// when pStr is NULL or when either MultiByteToWideChar call fails.
const wchar_t* Utf8ToWchar(const char *pStr)
{
    if (pStr == NULL)
    {
        return NULL;
    }

    // Sizing pass: with no output buffer the call only reports the required
    // character count (terminator included, because the input length is -1).
    const int len = MultiByteToWideChar(CP_UTF8, 0, pStr, -1, NULL, 0);
    if (len <= 0)
    {
        return NULL;
    }

    wchar_t *pwStr = new wchar_t[len];
    // The conversion itself can still fail; never hand back an uninitialised
    // buffer in that case.
    if (MultiByteToWideChar(CP_UTF8, 0, pStr, -1, pwStr, len) <= 0)
    {
        delete[] pwStr;
        return NULL;
    }
    return pwStr;
}
2、
// Decodes one three-byte UTF-8 sequence into a 16-bit value.
//
// pOut  - receives the decoded value; only its first two bytes are written.
// pText - points to the three bytes of the sequence.
//
// The bytes are not validated; they are assumed to form a three-byte sequence.
void UTF_8ToUnicode(WCHAR* pOut, char *pText)
{
    const unsigned int first = static_cast<unsigned char>(pText[0]);
    const unsigned int second = static_cast<unsigned char>(pText[1]);
    const unsigned int third = static_cast<unsigned char>(pText[2]);

    // The result is stored byte by byte: index 0 is the low byte, index 1 the
    // high byte.
    unsigned char* raw = reinterpret_cast<unsigned char*>(pOut);
    // High byte: four payload bits of the first byte, then the upper four
    // payload bits of the second.
    raw[1] = static_cast<unsigned char>(((first & 0x0F) << 4) | ((second >> 2) & 0x0F));
    // Low byte: remaining two payload bits of the second byte, then the six
    // payload bits of the third.
    raw[0] = static_cast<unsigned char>(((second & 0x03) << 6) | (third & 0x3F));
}
五、Ansi -> Unicode(wchar)
// Converts a multi-byte string to a wide string.
//
// pszSrc - source bytes.
// nLen   - number of bytes to convert, or -1 when pszSrc is NUL-terminated.
//
// Returns the converted text, cut at the first NUL character and with one
// leading byte-order mark (U+FEFF) removed. Returns an empty string when the
// conversion fails or produces nothing.
//
// NOTE(review): despite the "Ansi" name the conversion uses CP_UTF8, not
// CP_ACP. Kept as-is because callers may rely on it - confirm the intended
// code page.
std::wstring Ansi2WChar(LPCSTR pszSrc, int nLen)
{
    // Sizing pass: with no output buffer the call only reports the required
    // character count.
    const int nSize = MultiByteToWideChar(CP_UTF8, 0, pszSrc, nLen, NULL, 0);
    if (nSize <= 0)
    {
        // An empty string, not NULL: building a std::wstring from a null
        // pointer is undefined behaviour.
        return std::wstring();
    }

    // The string owns the buffer, so there is no new[]/delete[] pair to get
    // wrong.
    std::wstring converted(static_cast<std::wstring::size_type>(nSize), L'\0');
    const int nWritten = MultiByteToWideChar(CP_UTF8, 0, pszSrc, nLen, &converted[0], nSize);
    if (nWritten <= 0)
    {
        return std::wstring();
    }
    converted.resize(static_cast<std::wstring::size_type>(nWritten));

    // Keep the previous C-string semantics: the text ends at the first NUL
    // (this also drops the terminator counted when nLen is -1).
    const std::wstring::size_type terminator = converted.find(L'\0');
    if (terminator != std::wstring::npos)
    {
        converted.erase(terminator);
    }

    // Skip a leading byte-order mark.
    if (!converted.empty() && converted[0] == 0xFEFF)
    {
        converted.erase(0, 1);
    }
    return converted;
}
六、URL 解码(百分号编码 -> UTF-8 字节串)
// Returns true for byte values that UrlDecode deliberately leaves in their
// percent-encoded form: ASCII letters, digits and ! $ & ' ( ) * + , - . / : ; = ? @ _
static bool UrlDecodeKeepsEncoded(int value)
{
    if ((value >= 48 && value <= 57) ||   // 0-9
        (value >= 97 && value <= 122) ||  // a-z
        (value >= 65 && value <= 90))     // A-Z
    {
        return true;
    }
    switch (value)
    {
    case 0x21: case 0x24: case 0x26: case 0x27: case 0x28: case 0x29:
    case 0x2a: case 0x2b: case 0x2c: case 0x2d: case 0x2e: case 0x2f:
    case 0x3A: case 0x3B: case 0x3D: case 0x3f: case 0x40: case 0x5f:
        return true;
    default:
        return false;
    }
}

// Decodes percent-encoded bytes in a URL string.
//
// szToDecode - text that may contain %XX escape sequences.
//
// Returns a copy of the input in which each "%XX" (two hexadecimal digits) is
// replaced by the byte it encodes. An escape is left untouched when its value
// is an ASCII letter, a digit or one of ! $ & ' ( ) * + , - . / : ; = ? @ _.
// A '%' that is not followed by two hexadecimal digits is copied literally.
// '+' is not translated to a space.
std::string UrlDecode(const std::string& szToDecode)
{
    std::string result;
    const size_t nsumlen = szToDecode.length();
    result.reserve(nsumlen);

    for (size_t i = 0; i < nsumlen; ++i)
    {
        const char current = szToDecode[i];
        // An escape needs two more characters after the '%'.
        if (current == '%' && i + 2 < nsumlen)
        {
            // isxdigit is only defined for values representable as unsigned
            // char; a raw non-ASCII char may be negative.
            const unsigned char high = static_cast<unsigned char>(szToDecode[i + 1]);
            const unsigned char low = static_cast<unsigned char>(szToDecode[i + 2]);
            if (isxdigit(high) && isxdigit(low))
            {
                const std::string hexStr = szToDecode.substr(i + 1, 2);
                const int hex = static_cast<int>(strtol(hexStr.c_str(), 0, 16));
                if (!UrlDecodeKeepsEncoded(hex))
                {
                    result += char(hex);
                    i += 2;  // the two hex digits have been consumed
                    continue;
                }
            }
        }
        // Ordinary character, or a '%' that is not being decoded: its hex
        // digits (if any) are copied by the following iterations.
        result += current;
    }
    return result;
}