上篇主要介绍了3种字符编码的历史、具体编码规则以及相互关联关系,下面介绍如何进行转换。
下面这个函数实现的是GBK与UTF8之间的转换,支持Windows、Linux平台。
#define ENCODING_GBK 0 // Same value as CP_ACP (0): the default ANSI code page on Windows
#define ENCODING_UTF8 65001 // Same value as CP_UTF8 (65001): UTF-8 translation

/**
 * Convert a NUL-terminated string between GBK and UTF-8.
 *
 * @param source    NUL-terminated input bytes; may be NULL.
 * @param srcFormat Encoding of @p source: ENCODING_GBK or ENCODING_UTF8.
 * @param dstFormat Requested output encoding: ENCODING_GBK or ENCODING_UTF8.
 * @return The converted text.
 *         - NULL or empty input yields an empty string.
 *         - Windows: the formats are passed to the Win32 API as code-page ids;
 *           if any conversion step fails, a copy of @p source is returned.
 *         - Other platforms: any format other than ENCODING_GBK is treated as
 *           UTF-8; when source and destination are the same encoding a copy of
 *           @p source is returned; if iconv_open() fails an empty string is
 *           returned.
 *
 * NOTE(review): ENCODING_GBK is CP_ACP, so on Windows it presumably only means
 * GBK when the system ANSI code page is 936 - confirm against the deployment.
 */
std::string convertStringEncoding(const char* source, uint32_t srcFormat, uint32_t dstFormat)
{
    if (source == NULL)
    {
        // Constructing std::string from a null pointer is undefined behaviour.
        return std::string();
    }
#if defined(_WIN32)
    // Step 1: source code page -> UTF-16.
    // Passing -1 as the input length makes the API count and emit the terminator.
    const int wideLen = MultiByteToWideChar(srcFormat, 0, source, -1, NULL, 0);
    if (wideLen <= 0)
        return std::string(source);
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(srcFormat, 0, source, -1, &wide[0], wideLen) == 0)
        return std::string(source);

    // Step 2: UTF-16 -> destination code page.
    // The last two arguments must be NULL when the target is CP_UTF8.
    const int narrowLen = WideCharToMultiByte(dstFormat, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (narrowLen <= 0)
        return std::string(source);
    std::string narrow(static_cast<size_t>(narrowLen), '\0');
    if (WideCharToMultiByte(dstFormat, 0, wide.c_str(), -1, &narrow[0], narrowLen, NULL, NULL) == 0)
        return std::string(source);

    // The buffer includes the terminator written by the API; cut the result there.
    return std::string(narrow.c_str());
#else // POSIX: iconv
    size_t srcLeft = strlen(source);
    if (srcLeft == 0)
        return std::string();

    // Only two encodings are supported: anything that is not GBK counts as UTF-8.
    const bool srcIsGbk = (srcFormat == ENCODING_GBK);
    const bool dstIsGbk = (dstFormat == ENCODING_GBK);
    if (srcIsGbk == dstIsGbk)
    {
        // Same encoding on both sides: nothing to transcode.
        return std::string(source);
    }

    /* Suffix on the target encoding decides what happens to characters that
     * cannot be converted:
     *   //TRANSLIT : replace with a similar character
     *   //IGNORE   : drop the character */
    iconv_t cd = iconv_open(dstIsGbk ? "GBK//IGNORE" : "UTF-8//IGNORE",
                            srcIsGbk ? "GBK" : "UTF-8");
    if (cd == (iconv_t)-1)
        return std::string();

    // Neither encoding needs more than 4 output bytes per input byte,
    // so this capacity can never be exhausted.
    std::string converted(4 * srcLeft, '\0');
    char *in = const_cast<char*>(source); // iconv's prototype is not const-correct
    char *out = &converted[0];
    size_t dstLeft = converted.size();

    // Best effort: the return value is deliberately ignored so that whatever
    // was converted before an error is still returned to the caller.
    (void)iconv(cd, &in, &srcLeft, &out, &dstLeft);
    iconv_close(cd);

    // Keep exactly the bytes iconv produced.
    converted.resize(converted.size() - dstLeft);
    return converted;
#endif
}