真正UTF-8与GB2312间的转换(兼容windows和Linux)

最新推荐文章于 2024-08-05 21:38:51 发布

alexaroma

最新推荐文章于 2024-08-05 21:38:51 发布

阅读量3.6k

点赞数

文章标签： windows linux delete null class

本文链接：https://blog.csdn.net/alexaroma/article/details/2429724

版权

                                                       UTF-8与GB2312间的转换
                                                               作者：曾刘彬
中文转码是程序员可能经常遇到的一个问题，鄙人在这方面有些心得，故在此抛砖引玉了。
    我在网上看到好多关于UTF-8与GB2312间互相转换的文章都用以下的方法（为了方便，以后我称之为“拼凑法”）：
// Decodes one UTF-8 sequence of the form 1110xxxx 10xxxxxx 10xxxxxx into
// a 16-bit code unit.
//
// pOut   receives the result; only its first two bytes are written, low byte
//        at offset 0 and high byte at offset 1
// pText  input; exactly three bytes (pText[0..2]) are read
//
// The marker bits of the three input bytes are masked off without being
// checked, so 1-, 2- and 4-byte sequences are not handled.
      void CChineseCodeLib::UTF_8ToUnicode(WCHAR* pOut,char *pText)
      {
          char* const dstBytes = (char *)pOut;

          // High byte: 4 payload bits of byte 0 followed by the upper 4
          // payload bits of byte 1.
          const int leadNibble = (pText[0] & 0x0F) << 4;
          const int midUpper   = (pText[1] >> 2) & 0x0F;
          dstBytes[1] = static_cast<char>(leadNibble | midUpper);

          // Low byte: lower 2 payload bits of byte 1 followed by the 6
          // payload bits of byte 2.
          const int midLower = (pText[1] & 0x03) << 6;
          const int tailBits = pText[2] & 0x3F;
          dstBytes[0] = static_cast<char>(midLower | tailBits);
      }
// Encodes one 16-bit code unit as the 3-byte UTF-8 form
// 1110xxxx 10xxxxxx 10xxxxxx.
//
// pOut   receives exactly three bytes (pOut[0..2]); no terminator is written
// pText  input; its first two bytes are read, offset 0 being treated as the
//        low byte and offset 1 as the high byte
//
// The 3-byte form is emitted unconditionally, whatever the input value is.
void CChineseCodeLib::UnicodeToUTF_8(char* pOut,WCHAR* pText)
{
    const char* const srcBytes = (const char *)pText;

    // Lead byte: marker 1110 plus the top 4 bits of the high byte.
    const int topNibble = (srcBytes[1] & 0xF0) >> 4;
    pOut[0] = static_cast<char>(0xE0 | topNibble);

    // First continuation byte: marker 10, the low 4 bits of the high byte,
    // then the top 2 bits of the low byte.
    const int highLow4 = (srcBytes[1] & 0x0F) << 2;
    const int lowTop2  = (srcBytes[0] & 0xC0) >> 6;
    pOut[1] = static_cast<char>((0x80 | highLow4) + lowTop2);

    // Second continuation byte: marker 10 plus the low 6 bits of the low byte.
    const int lowLow6 = srcBytes[0] & 0x3F;
    pOut[2] = static_cast<char>(0x80 | lowLow6);
}
    “拼凑法”可以成功的转换大部分的UTF-8编码，不过作为一个负责的人，我想指出其中的缺陷：
    真正的UTF-8的编码规则如下：
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

       可见UTF-8 编码字符理论上可以是1 - 6 个字节长，而“拼凑法”只处理了1字节和3字节两种编码方式。有的人可能会说：16 位 BMP 字符最多只用到 3 字节长。这没错，但“拼凑法”也没处理2字节编码方式。所以我得出结论：“拼凑法”是不安全的。
        言归正传，其实UTF-8与GB2312之间的转化不用弄得那么复杂。
        在windows下用这两个现成的函数，通过不同的参数就能实现。
            ::WideCharToMultiByte（。。。）
            ::MultiByteToWideChar（。。。）
       若在Linux下，则用iconv库函数（iconv_open / iconv / iconv_close）实现。具体代码如下：
        如果是windows下，则：#define __cdn_win32_platform__
// Code page identifier passed to MultiByteToWideChar / WideCharToMultiByte
// for the GB2312 side of the conversions in the Windows branch.
#define CP_GB2312 20936
// Declares the two conversion entry points between UTF-8 and GB2312.
// Each definition has a Windows branch (selected by __cdn_win32_platform__)
// and an iconv branch; the meaning of the return value is branch-specific,
// see the definitions.
class CodingTransformer {
public:
// UTF-8 -> GB2312.
//   pOut      destination buffer for the GB2312 bytes
//   iBufSize  size of pOut in bytes
//   pText     UTF-8 input
//   iLenth    input length (named pLen in the definition)
int UTF_8ToGB2312(char* pOut, int iBufSize, char *pText, int iLenth);
// GB2312 -> UTF-8.
//   pOut      destination buffer for the UTF-8 bytes
//   iBufSize  size of pOut in bytes
//   pText     GB2312 input
//   iLenth    input length (named pLen in the definition)
int GB2312ToUTF_8(char* pOut, int iBufSize,char *pText, int iLenth);
};
// Converts UTF-8 text to GB2312.
//
//   pOut      destination buffer that receives the GB2312 bytes
//   iBufSize  capacity of pOut in bytes
//   pText     UTF-8 input; the Windows branch reads it up to and including
//             its terminating NUL
//   pLen      input length in bytes; the iconv branch converts exactly this
//             many bytes, the Windows branch uses it only to size its
//             wide-character scratch buffer
//
// Returns the number of bytes written to pOut on success (the Windows branch
// counts the terminating NUL it copies). On failure the Windows branch
// returns 0 and the iconv branch returns -1.
int CodingTransformer::UTF_8ToGB2312(char* pOut, int iBufSize, char *pText, int pLen)
{
#ifdef __cdn_win32_platform__
    if (!pText || pLen < 0)
        return 0;

    // cbMultiByte == -1 makes MultiByteToWideChar convert the terminating NUL
    // as well, so the scratch buffer needs one slot more than pLen.
    const int iWideCapacity = pLen + 1;
    WCHAR* pWtemp = new WCHAR[iWideCapacity];

    int iMultByteLenth = 0;
    const int iWcharLenth = ::MultiByteToWideChar(CP_UTF8, 0, pText, -1,
                                                  pWtemp, iWideCapacity);
    if (0 != iWcharLenth)
    {
        iMultByteLenth = ::WideCharToMultiByte(CP_GB2312, 0, pWtemp,
                                               iWcharLenth, pOut,
                                               iBufSize, NULL, NULL);
    }
    if (0 == iMultByteLenth)
    {
        // Read the error code before any other call can overwrite it.
        const DWORD dwLastErr = GetLastError();
        printf("alexaroma:转码错误，错误号:%lu\n",
               static_cast<unsigned long>(dwLastErr));
    }

    // Allocated with new[], so it must be released with delete[].
    delete[] pWtemp;
    return iMultByteLenth;
#else//__cdn_win32_platform__
    if (!pOut || !pText || iBufSize < 0 || pLen < 0)
        return -1;

    // iconv_open signals failure with (iconv_t)-1, not with a null handle.
    iconv_t cd = iconv_open("GB2312", "UTF-8");
    if (cd == (iconv_t)-1)
        return -1;

    // Zero the whole destination (its size is iBufSize; its previous contents
    // are unspecified) so the result is NUL-terminated whenever it does not
    // fill the buffer completely.
    memset(pOut, 0, static_cast<size_t>(iBufSize));

    // iconv advances the cursors and decrements the counters in place; they
    // must be real size_t objects, not int variables viewed through a cast.
    char*  pInCursor  = pText;
    char*  pOutCursor = pOut;
    size_t inLeft     = static_cast<size_t>(pLen);
    size_t outLeft    = static_cast<size_t>(iBufSize);

    const size_t rc = iconv(cd, &pInCursor, &inLeft, &pOutCursor, &outLeft);
    iconv_close(cd);

    if (rc == static_cast<size_t>(-1))
        return -1;
    return iBufSize - static_cast<int>(outLeft);
#endif//__cdn_win32_platform__
}
// Converts GB2312 text to UTF-8.
//
//   pOut      destination buffer that receives the UTF-8 bytes
//   iBufSize  capacity of pOut in bytes
//   pText     GB2312 input; the Windows branch reads it up to and including
//             its terminating NUL
//   pLen      input length in bytes; the iconv branch converts exactly this
//             many bytes, the Windows branch uses it only to size its
//             wide-character scratch buffer
//
// Returns the number of bytes written to pOut on success (the Windows branch
// counts the terminating NUL it copies). On failure the Windows branch
// returns 0 and the iconv branch returns -1.
int CodingTransformer::GB2312ToUTF_8(char* pOut, int iBufSize,
           char *pText, int pLen)
{
#ifdef __cdn_win32_platform__
    if (!pText || pLen < 0)
        return 0;

    // cbMultiByte == -1 makes MultiByteToWideChar convert the terminating NUL
    // as well, so the scratch buffer needs one slot more than pLen.
    const int iWideCapacity = pLen + 1;
    WCHAR* pWtemp = new WCHAR[iWideCapacity];

    int iMultByteLenth = 0;
    const int iWcharLenth = ::MultiByteToWideChar(CP_GB2312, MB_PRECOMPOSED,
                                                  pText, -1, pWtemp,
                                                  iWideCapacity);
    if (0 != iWcharLenth)
    {
        iMultByteLenth = ::WideCharToMultiByte(CP_UTF8, 0, pWtemp,
                                               iWcharLenth, pOut,
                                               iBufSize, NULL, NULL);
    }
    if (0 == iMultByteLenth)
    {
        // Read the error code before any other call can overwrite it.
        const DWORD dwLastErr = GetLastError();
        printf("alexaroma:转码错误，错误号:%lu\n",
               static_cast<unsigned long>(dwLastErr));
    }

    // Allocated with new[], so it must be released with delete[].
    delete[] pWtemp;
    return iMultByteLenth;
#else//__cdn_win32_platform__
    if (!pOut || !pText || iBufSize < 0 || pLen < 0)
        return -1;

    // iconv_open signals failure with (iconv_t)-1, not with a null handle.
    iconv_t cd = iconv_open("UTF-8", "GB2312");
    if (cd == (iconv_t)-1)
        return -1;

    // Zero the whole destination (its size is iBufSize; its previous contents
    // are unspecified) so the result is NUL-terminated whenever it does not
    // fill the buffer completely.
    memset(pOut, 0, static_cast<size_t>(iBufSize));

    // iconv advances the cursors and decrements the counters in place; they
    // must be real size_t objects, not int variables viewed through a cast.
    char*  pInCursor  = pText;
    char*  pOutCursor = pOut;
    size_t inLeft     = static_cast<size_t>(pLen);
    size_t outLeft    = static_cast<size_t>(iBufSize);

    const size_t rc = iconv(cd, &pInCursor, &inLeft, &pOutCursor, &outLeft);
    iconv_close(cd);

    if (rc == static_cast<size_t>(-1))
        return -1;
    return iBufSize - static_cast<int>(outLeft);
#endif//__cdn_win32_platform__
}