wchar_t 、UTF-8、UTF-16的转换方法

dolphin98629

已于 2022-02-28 13:55:04 修改

阅读量3.6k

点赞数

分类专栏：编码与加密文章标签： gnu windows c++

于 2022-02-25 15:26:58 首次发布

原文链接：https://blog.csdn.net/gwj239/article/details/9120489

版权

编码与加密专栏收录该内容

8 篇文章 0 订阅

订阅专栏

Unicode 有两套编码集，UCS-2 和 UCS-4。Windows 的内部其实是用的 UCS-2 标准，并用 UTF-16 来实现。而非 Windows 系统大多采用了 UTF-8 实现。

大家都知道，在 Windows 上 wchar_t 用 2 个字节表示，而在 Linux 上 wchar_t 用 4 个字节表示。这样在编写跨平台程序时，宽字符的处理就无法统一。

下面是我用到的几个函数，wchar_t与UTF-8编码之间的转换和UTF-16和wchar_t之间的转换。

#ifdef WINDOWS
#include <windows.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#else
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#endif

// Converts a wchar_t string to UTF-8.
//
// a_szSrc      source wide string
// a_nSrcSize   size of the source in BYTES (iconv branch only; the Windows
//              branch ignores it and requires a_szSrc to be NUL-terminated)
// a_szDest     destination buffer
// a_nDestSize  size of the destination buffer in bytes
//
// Returns:
//   Windows: the value of WideCharToMultiByte (bytes written, 0 on failure).
//   Others:  the value of iconv() (count of non-reversible conversions,
//            normally 0) on success, -1 on failure.
// NOTE(review): the two branches report success differently; callers that
// must be portable should not compare the result against a byte count.
int FW2UTF8Convert(const wchar_t* a_szSrc, int a_nSrcSize, char* a_szDest, int a_nDestSize)
{
#ifdef WINDOWS
    return WideCharToMultiByte(CP_UTF8, 0, a_szSrc, -1, a_szDest, a_nDestSize, NULL, NULL);
#else
    if (a_szSrc == NULL || a_szDest == NULL || a_nSrcSize < 0 || a_nDestSize < 0)
    {
        return -1;
    }

    iconv_t env = iconv_open("UTF-8", "WCHAR_T");
    if (env == (iconv_t)-1)
    {
        printf("iconv_open WCHAR_T->UTF8 error%s %d\n", strerror(errno), errno);
        return -1;
    }

    // iconv() advances the pointers and decrements the counters it is given,
    // and the counters must be real size_t objects: casting an int* to
    // size_t* makes iconv touch 8 bytes of a 4-byte variable on LP64 targets.
    char* pIn = reinterpret_cast<char*>(const_cast<wchar_t*>(a_szSrc));
    char* pOut = a_szDest;
    size_t nInLeft = static_cast<size_t>(a_nSrcSize);
    size_t nOutLeft = static_cast<size_t>(a_nDestSize);

    const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
    const int nSavedErrno = errno; // iconv_close() may overwrite errno
    iconv_close(env);              // release the descriptor on every path

    if (result == (size_t)-1)
    {
        printf("iconv WCHAR_T->UTF8 error %d\n", nSavedErrno);
        return -1;
    }
    return (int)result;
#endif
}

// Converts a NUL-terminated UTF-8 string to wchar_t. The terminating NUL is
// converted as well, so the output is NUL-terminated when it fits.
//
// a_szSrc      NUL-terminated UTF-8 source
// a_szDest     destination buffer
// a_nDestSize  capacity of the destination buffer.
//              NOTE(review): the Windows branch hands this to
//              MultiByteToWideChar, which counts wchar_t elements, while the
//              iconv branch treats it as a byte count - confirm what callers pass.
//
// Returns:
//   Windows: the value of MultiByteToWideChar (characters written, 0 on failure).
//   Others:  the value of iconv() (count of non-reversible conversions,
//            normally 0) on success, -1 on failure.
int FUTF82WConvert(const char* a_szSrc, wchar_t* a_szDest, int a_nDestSize)
{
#ifdef WINDOWS
    return MultiByteToWideChar(CP_UTF8, 0, a_szSrc, -1, a_szDest, a_nDestSize);
#else
    if (a_szSrc == NULL || a_szDest == NULL || a_nDestSize < 0)
    {
        return -1;
    }

    iconv_t env = iconv_open("WCHAR_T", "UTF-8");
    if (env == (iconv_t)-1)
    {
        printf("iconv_open UTF8->WCHAR_T error %d\n", errno);
        return -1;
    }

    // Use genuine size_t counters: iconv() writes through these pointers, and
    // an int reinterpreted as size_t is too small on LP64 targets.
    char* pIn = const_cast<char*>(a_szSrc);
    char* pOut = reinterpret_cast<char*>(a_szDest);
    size_t nInLeft = strlen(a_szSrc) + 1; // +1 converts the terminating NUL too
    size_t nOutLeft = static_cast<size_t>(a_nDestSize);

    const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
    const int nSavedErrno = errno; // iconv_close() may overwrite errno
    iconv_close(env);              // release the descriptor on every path

    if (result == (size_t)-1)
    {
        printf("iconv UTF8->WCHAR_T error %d\n", nSavedErrno);
        return -1;
    }
    return (int)result;
#endif
}

// Converts a wchar_t string to 16-bit code units.
//
// a_szSrc      source wide string
// a_nSize      size of the source in bytes
// a_szDest     destination buffer
// a_nDestSize  size of the destination buffer in bytes
//
// Returns:
//   Windows: a_nSize after a plain byte copy, or -1 when a size is negative
//            or memcpy_s reports an error.
//   Others:  the value of iconv() (count of non-reversible conversions,
//            normally 0) on success, -1 on failure.
// NOTE(review): the iconv branch converts to "UCS-2-INTERNAL", which is not
// full UTF-16 (no surrogate pairs), and that encoding name may not be
// accepted by every iconv implementation - confirm on the target platform.
int FW2UConvert(const wchar_t* a_szSrc, int a_nSize, char* a_szDest, int a_nDestSize)
{
    // A negative int would turn into a huge size_t below.
    if (a_nSize < 0 || a_nDestSize < 0)
    {
        return -1;
    }
#ifdef WINDOWS
    // Do not report success when the copy was rejected (e.g. buffer too small).
    if (memcpy_s(a_szDest, (size_t)a_nDestSize, a_szSrc, (size_t)a_nSize) != 0)
    {
        return -1;
    }
    return a_nSize;
#else
    if (a_szSrc == NULL || a_szDest == NULL)
    {
        return -1;
    }

    iconv_t env = iconv_open("UCS-2-INTERNAL", "UCS-4-INTERNAL");
    if (env == (iconv_t)-1)
    {
        printf("iconv_open WCHAR_T->UTF16 error%s %d\n", strerror(errno), errno);
        return -1;
    }

    // Use genuine size_t counters: iconv() writes through these pointers, and
    // an int reinterpreted as size_t is too small on LP64 targets.
    char* pIn = reinterpret_cast<char*>(const_cast<wchar_t*>(a_szSrc));
    char* pOut = a_szDest;
    size_t nInLeft = static_cast<size_t>(a_nSize);
    size_t nOutLeft = static_cast<size_t>(a_nDestSize);

    const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
    const int nSavedErrno = errno; // iconv_close() may overwrite errno
    iconv_close(env);              // release the descriptor on every path

    if (result == (size_t)-1)
    {
        printf("iconv WCHAR_T->UTF16 error %s %d\n", strerror(nSavedErrno), nSavedErrno);
        return -1;
    }
    return (int)result;
#endif
}

// Converts 16-bit code units to a wchar_t string.
//
// a_szSrc      source buffer holding 16-bit code units
// a_nSize      size of the source in bytes
// a_szDest     destination wide-character buffer
// a_nDestSize  size of the destination buffer in bytes
//
// Returns:
//   Windows: a_nSize after a plain byte copy, or -1 when a size is negative
//            or memcpy_s reports an error.
//   Others:  the value of iconv() (count of non-reversible conversions,
//            normally 0) on success, -1 on failure.
// NOTE(review): the iconv branch reads "UCS-2-INTERNAL", which is not full
// UTF-16 (no surrogate pairs), and that encoding name may not be accepted by
// every iconv implementation - confirm on the target platform.
int FU2WConvert(const char* a_szSrc, int a_nSize, wchar_t* a_szDest, int a_nDestSize)
{
    // A negative int would turn into a huge size_t below.
    if (a_nSize < 0 || a_nDestSize < 0)
    {
        return -1;
    }
#ifdef WINDOWS
    // Do not report success when the copy was rejected (e.g. buffer too small).
    if (memcpy_s(a_szDest, (size_t)a_nDestSize, a_szSrc, (size_t)a_nSize) != 0)
    {
        return -1;
    }
    return a_nSize;
#else
    if (a_szSrc == NULL || a_szDest == NULL)
    {
        return -1;
    }

    iconv_t env = iconv_open("UCS-4-INTERNAL", "UCS-2-INTERNAL");
    if (env == (iconv_t)-1)
    {
        printf("iconv_open error %d\n", errno);
        return -1;
    }

    // Use genuine size_t counters: iconv() writes through these pointers, and
    // an int reinterpreted as size_t is too small on LP64 targets.
    char* pIn = const_cast<char*>(a_szSrc);
    char* pOut = reinterpret_cast<char*>(a_szDest);
    size_t nInLeft = static_cast<size_t>(a_nSize);
    size_t nOutLeft = static_cast<size_t>(a_nDestSize);

    const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
    const int nSavedErrno = errno; // iconv_close() may overwrite errno
    iconv_close(env);              // release the descriptor on every path

    if (result == (size_t)-1)
    {
        printf("UTF16 -> WCHAR_T conv error %d\n", nSavedErrno);
        return -1;
    }
    return (int)result;
#endif
}

PS：在 Linux 上我使用的是 iconv 库，其中 wchar_t 一般采用 UCS-4 编码。

UCS-4-INTERNAL ，UCS-2-INTERNAL 会根据本机的存储方式(大端、小端)进行处理。

还有UCS-2LE和UCS-2BE 分别代表小端和大端模式。

字符串乱码往往是由于编码不一致，或者目标编码中没有对应的字符所致。为了能够正常显示字符串，经常需要进行编码转换。为了方便使用，这里整理成一个 header-only 文件，提供了 char、wchar_t、UTF-8 之间的转换。在实际项目中建议使用 wchar_t 或 UTF-8，强烈建议使用 UTF-8。

#pragma once
#include <Windows.h>
#include <string>
#include <vector>
#include <assert.h>
 
/*!
 * 编码转换命名空间
 * 
 */
namespace ZEncode
{
    /*!
     * Converts a narrow (multi-byte) string to a wide string.
     *
     * The conversion uses the stored length of \p str, so data following an
     * embedded NUL character is converted instead of being silently dropped.
     *
     * \param str narrow string to convert
     * \param uCodePage code page \p str is encoded in
     * \return the wide string; empty when \p str is empty
     * \throws std::exception (what() returns "A2W Error") when the conversion
     *         fails or \p str is too long for the API's int length parameter
     */
    static std::wstring A2W(const std::string &str, UINT uCodePage)
    {
        // std::exception has no message-taking constructor in standard C++
        // (that is an MSVC extension), so the message lives in a derived type.
        struct ConvertError : std::exception
        {
            const char* what() const noexcept override { return "A2W Error"; }
        };

        if (str.empty())
        {
            // The API rejects a source length of 0, so answer directly.
            return std::wstring();
        }

        // The API takes the length as int; refuse input that does not fit.
        const int nSrcLength = static_cast<int>(str.size());
        if (nSrcLength <= 0 || static_cast<size_t>(nSrcLength) != str.size())
        {
            throw ConvertError();
        }

        // First call: ask how many wide characters are required.
        const int nLength = ::MultiByteToWideChar(uCodePage, 0, str.data(), nSrcLength, NULL, 0);
        if (nLength <= 0)
        {
            throw ConvertError();
        }

        // Second call: convert straight into the result's storage.
        std::wstring strW(nLength, L'\0');
        const int nResult = ::MultiByteToWideChar(uCodePage, 0, str.data(), nSrcLength, &strW[0], nLength);
        if (nResult != nLength)
        {
            throw ConvertError();
        }
        return strW;
    }
 
    /*!
     * Converts a wide string to a narrow (multi-byte) string.
     *
     * The conversion uses the stored length of \p str, so data following an
     * embedded NUL character is converted instead of being silently dropped.
     *
     * \param str wide string to convert
     * \param uCodePage code page to encode the result in
     * \return the narrow string; empty when \p str is empty
     * \throws std::exception (what() returns "W2A Error") when the conversion
     *         fails or \p str is too long for the API's int length parameter
     */
    static std::string W2A(const std::wstring &str, UINT uCodePage)
    {
        // std::exception has no message-taking constructor in standard C++
        // (that is an MSVC extension), so the message lives in a derived type.
        struct ConvertError : std::exception
        {
            const char* what() const noexcept override { return "W2A Error"; }
        };

        if (str.empty())
        {
            // The API rejects a source length of 0, so answer directly.
            return std::string();
        }

        // The API takes the length as int; refuse input that does not fit.
        const int nSrcLength = static_cast<int>(str.size());
        if (nSrcLength <= 0 || static_cast<size_t>(nSrcLength) != str.size())
        {
            throw ConvertError();
        }

        // First call: ask how many bytes are required.
        const int nLength = ::WideCharToMultiByte(uCodePage, 0, str.data(), nSrcLength, NULL, 0, NULL, NULL);
        if (nLength <= 0)
        {
            throw ConvertError();
        }

        // Second call: convert straight into the result's storage.
        std::string strA(nLength, '\0');
        const int nResult = ::WideCharToMultiByte(uCodePage, 0, str.data(), nSrcLength, &strA[0], nLength, NULL, NULL);
        if (nResult != nLength)
        {
            throw ConvertError();
        }
        return strA;
    }
 
    /*!
     * Re-encodes a narrow string from one code page to another, going through
     * a wide-string intermediate.
     *
     * \param str narrow string to convert
     * \param uCodePageFrom code page \p str is currently encoded in
     * \param uCodePageTo code page of the returned string
     * \return \p str re-encoded in \p uCodePageTo
     */
    static std::string A2A(const std::string &str, UINT uCodePageFrom, UINT uCodePageTo)
    {
        const std::wstring strWide = A2W(str, uCodePageFrom);
        return W2A(strWide, uCodePageTo);
    }
 
    /*!
     * Checks whether a buffer looks like UTF-8 encoded data.
     *
     * Each lead byte must be followed by the right number of continuation
     * bytes (10xxxxxx). Sequences of 1 to 4 bytes are accepted, so characters
     * outside the Basic Multilingual Plane are recognised too. A multi-byte
     * sequence cut off by the end of the buffer is tolerated.
     *
     * \param pBuffer buffer to inspect
     * \param size number of bytes in \p pBuffer
     * \return true when the data is consistent with UTF-8, otherwise false
     *
     * \note This is a structural heuristic only: overlong forms and surrogate
     *       code points are not rejected, and text in other encodings can
     *       pass by chance.
     */
    static bool IsUTF8(const void* pBuffer, size_t size)
    {
        const unsigned char* pCur = static_cast<const unsigned char*>(pBuffer);
        size_t nRemaining = size;

        while (nRemaining > 0)
        {
            const unsigned char lead = *pCur;
            size_t nSeqLength = 0;

            if (lead < 0x80)        // 0xxxxxxx: ASCII
            {
                nSeqLength = 1;
            }
            else if (lead < 0xC0)   // 10xxxxxx: continuation byte without a lead byte
            {
                return false;
            }
            else if (lead < 0xE0)   // 110xxxxx: 2-byte sequence
            {
                nSeqLength = 2;
            }
            else if (lead < 0xF0)   // 1110xxxx: 3-byte sequence
            {
                nSeqLength = 3;
            }
            else if (lead < 0xF5)   // 11110xxx up to 0xF4: 4-byte sequence
            {
                nSeqLength = 4;
            }
            else                    // 0xF5..0xFF never occur in UTF-8
            {
                return false;
            }

            // Compare lengths rather than pointers so that no pointer before
            // the start of the buffer is ever formed for tiny inputs.
            if (nSeqLength > nRemaining)
            {
                break; // truncated final character: do not count it as an error
            }

            for (size_t i = 1; i < nSeqLength; ++i)
            {
                if ((pCur[i] & 0xC0) != 0x80)
                {
                    return false;
                }
            }

            pCur += nSeqLength;
            nRemaining -= nSeqLength;
        }

        return true;
    }
 
    /*!
     * Checks whether a string looks like UTF-8 encoded data.
     *
     * Forwards the string's bytes and length to the buffer overload.
     *
     * \param str string to inspect
     * \return true when the buffer overload accepts the data, otherwise false
     *
     * \note The result is a heuristic and is not guaranteed to be correct.
     */
    static bool IsUTF8(const std::string &str)
    {
        const char* pBytes = str.c_str();
        const size_t nBytes = str.size();
        return IsUTF8(pBytes, nBytes);
    }
}
 
// Convenience macros wrapping the ZEncode conversions for the common
// ANSI (CP_ACP), UTF-8 (CP_UTF8) and wide-character combinations.
#define ANSI_TO_WCHAR(str)         (ZEncode::A2W(str, CP_ACP))
#define ANSI_TO_UTF8(str)          (ZEncode::A2A(str, CP_ACP, CP_UTF8))
 
#define UTF8_TO_ANSI(str)          (ZEncode::A2A(str, CP_UTF8, CP_ACP))
#define UTF8_TO_WCHAR(str)         (ZEncode::A2W(str, CP_UTF8))
 
#define WCHAR_TO_ANSI(str)         (ZEncode::W2A(str, CP_ACP))
#define WCHAR_TO_UTF8(str)         (ZEncode::W2A(str, CP_UTF8))


    // Round trip: ANSI -> wide -> ANSI must reproduce the original text.
    TEST(ZEncode, ansi_to_wchar_to_ansi)
    {
        const std::string ansiText("中华人民共和国");

        const std::wstring wideText = ANSI_TO_WCHAR(ansiText);
        EXPECT_STREQ(wideText.c_str(), L"中华人民共和国");

        const std::string ansiAgain = WCHAR_TO_ANSI(wideText);
        EXPECT_STREQ(ansiAgain.c_str(), "中华人民共和国");
    }
 
    // ANSI -> UTF-8 -> wide: the UTF-8 form must be detected as UTF-8 and
    // decode to the expected wide string.
    // NOTE(review): the EXPECT_FALSE presumably relies on this source file
    // being saved in a non-UTF-8 ANSI code page - confirm.
    TEST(ZEncode, ansi_to_utf8_to_wchar)
    {
        const std::string ansiText("中华人民共和国");
        const std::string utf8Text = ANSI_TO_UTF8(ansiText);

        EXPECT_FALSE(ZEncode::IsUTF8(ansiText));
        EXPECT_TRUE(ZEncode::IsUTF8(utf8Text));

        const std::wstring wideText = UTF8_TO_WCHAR(utf8Text);
        EXPECT_STREQ(wideText.c_str(), L"中华人民共和国");
    }
 
    // Round trip: ANSI -> UTF-8 -> ANSI must reproduce the original bytes.
    // NOTE(review): the EXPECT_FALSE presumably relies on this source file
    // being saved in a non-UTF-8 ANSI code page - confirm.
    TEST(ZEncode, ansi_to_utf8_to_ansi)
    {
        const std::string ansiText("中华人民共和国");
        const std::string utf8Text = ANSI_TO_UTF8(ansiText);

        EXPECT_FALSE(ZEncode::IsUTF8(ansiText));
        EXPECT_TRUE(ZEncode::IsUTF8(utf8Text));

        const std::string ansiAgain = UTF8_TO_ANSI(utf8Text);
        EXPECT_STREQ(ansiAgain.c_str(), ansiText.c_str());
    }
 
    // Round trip: wide -> UTF-8 -> wide must reproduce the original text.
    TEST(ZEncode, wchar_to_utf8_to_wchar)
    {
        const std::wstring wideText(L"中华人民共和国");

        const std::string utf8Text = WCHAR_TO_UTF8(wideText);
        EXPECT_TRUE(ZEncode::IsUTF8(utf8Text));

        const std::wstring wideAgain = UTF8_TO_WCHAR(utf8Text);
        EXPECT_STREQ(wideAgain.c_str(), wideText.c_str());
    }

其它：在 C++11 中，如果希望字符串字面量的编码为 UTF-8，只需要在字符串前面加上 u8 前缀即可，如下：