Unicode转义序列转utf8

我在编程迷了路

已于 2024-08-28 17:16:24 修改

阅读量685

点赞数 3

文章标签：算法 c++ windows

于 2024-04-08 18:08:23 首次发布

本文链接：https://blog.csdn.net/qq_43179054/article/details/137518373

版权

本文介绍了一个将Unicode转义序列转换为UTF-8编码的实用函数。该函数能够处理多种Unicode码点，并将其正确地转换为对应的UTF-8字节序列。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

以下unicodeToUtf8函数将Unicode转义序列转换为UTF-8编码的字符串。它遍历输入字符串，检查当前位置是否为转义序列（即两个反斜杠后跟字母u，再跟4位十六进制数字，例如 \\u6587）。如果是转义序列，则解析出对应的Unicode码点，并根据UTF-8编码规则将码点转换为UTF-8字节序列，拼接到结果字符串中；否则将当前字符原样追加到结果字符串中。

// Returns the numeric value (0-15) of a hexadecimal digit, or -1 when `ch`
// is not a hexadecimal digit.
static int utf8HexDigitValue(char ch)
{
    if (ch >= '0' && ch <= '9')
    {
        return ch - '0';
    }
    if (ch >= 'a' && ch <= 'f')
    {
        return ch - 'a' + 10;
    }
    if (ch >= 'A' && ch <= 'F')
    {
        return ch - 'A' + 10;
    }
    return -1;
}

// Tries to read one escape (backslash, backslash, 'u', four hex digits)
// starting at `pos`. Returns the 16-bit value on success, or -1 when the text
// at `pos` is not a complete, well-formed escape.
static int utf8ParseEscape(const std::string& text, std::size_t pos)
{
    const std::size_t kEscapeLen = 7; // two backslashes + 'u' + 4 hex digits
    if (pos >= text.size() || text.size() - pos < kEscapeLen)
    {
        return -1;
    }
    if (text[pos] != '\\' || text[pos + 1] != '\\' || text[pos + 2] != 'u')
    {
        return -1;
    }

    int value = 0;
    for (std::size_t i = pos + 3; i < pos + kEscapeLen; ++i)
    {
        const int digit = utf8HexDigitValue(text[i]);
        if (digit < 0)
        {
            return -1;
        }
        value = (value << 4) | digit;
    }
    return value;
}

// Appends the UTF-8 encoding of `code` (expected range 0..0x10ffff) to `out`.
static void utf8AppendCodePoint(std::string& out, int code)
{
    if (code <= 0x7f)
    {
        out.push_back(static_cast<char>(code));
    }
    else if (code <= 0x7ff)
    {
        out.push_back(static_cast<char>(0xc0 | (code >> 6)));
        out.push_back(static_cast<char>(0x80 | (code & 0x3f)));
    }
    else if (code <= 0xffff)
    {
        out.push_back(static_cast<char>(0xe0 | (code >> 12)));
        out.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
        out.push_back(static_cast<char>(0x80 | (code & 0x3f)));
    }
    else
    {
        out.push_back(static_cast<char>(0xf0 | (code >> 18)));
        out.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3f)));
        out.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
        out.push_back(static_cast<char>(0x80 | (code & 0x3f)));
    }
}

// Decodes Unicode escape sequences embedded in `strUnicode` into UTF-8.
//
// An escape is recognised only in the doubly-escaped form: two backslash
// characters, the letter 'u', then exactly four hexadecimal digits
// (7 characters in total). Every other character is copied unchanged.
//
// - A high surrogate escape immediately followed by a low surrogate escape is
//   combined into one code point above U+FFFF and emitted as 4 bytes.
// - Anything that cannot be decoded (fewer than four hex digits, non-hex
//   digits, a surrogate without its partner) is copied to the output
//   verbatim instead of being dropped or turned into invalid UTF-8.
//
// Returns the decoded string; an escape for U+0000 yields an embedded NUL.
std::string unicodeToUtf8(const std::string& strUnicode)
{
    const std::size_t kEscapeLen = 7;
    const std::size_t len = strUnicode.size();

    std::string strUtf8;
    strUtf8.reserve(len); // decoded output is never longer than the input

    std::size_t pos = 0;
    while (pos < len)
    {
        const int unit = utf8ParseEscape(strUnicode, pos);
        if (unit < 0)
        {
            // Not an escape here: copy one character and always advance,
            // otherwise a malformed escape would spin forever.
            strUtf8.push_back(strUnicode[pos]);
            ++pos;
            continue;
        }

        if (unit >= 0xd800 && unit <= 0xdbff)
        {
            // High surrogate: only meaningful together with a following
            // low surrogate escape.
            const int low = utf8ParseEscape(strUnicode, pos + kEscapeLen);
            if (low >= 0xdc00 && low <= 0xdfff)
            {
                const int code = 0x10000 + ((unit - 0xd800) << 10) + (low - 0xdc00);
                utf8AppendCodePoint(strUtf8, code);
                pos += 2 * kEscapeLen;
                continue;
            }
        }

        if (unit >= 0xd800 && unit <= 0xdfff)
        {
            // Unpaired surrogate: encoding it would produce invalid UTF-8,
            // so leave the escape text as it was written.
            strUtf8.push_back(strUnicode[pos]);
            ++pos;
            continue;
        }

        utf8AppendCodePoint(strUtf8, unit);
        pos += kEscapeLen;
    }
    return strUtf8;
}

int main()
{
    CString strJson = L"{\\\\u6587.exe\\\\u0000\\\\u6587.exe\\\\u005c\\\\u0022\\\\u000a\\\\u0027"};
    strJson.Replace(L"\\\\u0000", L"");
    std::string unicodeStr = CW2A(strJson, CP_UTF8); // "中文"的Unicode转义序列

    std::string chineseStr = unicodeToUtf8(unicodeStr);
    std::cout << chineseStr << std::endl; // 输出中文字符
    CString strDec;
    strDec = CA2W(chineseStr.c_str(), CP_UTF8);
    return 0;
}

第二种方法（C#：使用 .NET 的 Regex.Unescape）

// Sample input: three \uXXXX escape sequences (single backslash at runtime).
string jsonString = "\\u6211\\u662f\\u5c71";
// Keep the decoded text; the call has no effect if its result is dropped.
string chineseText = ConvertUnicodeJsonToChinese(jsonString);
/// <summary>
/// Decodes the escape sequences in <paramref name="jsonString"/> by passing
/// the text through <c>Regex.Unescape</c>.
/// </summary>
/// <param name="jsonString">Text containing escape sequences.</param>
/// <returns>The string returned by <c>Regex.Unescape</c>.</returns>
/// <remarks>
/// NOTE(review): Regex.Unescape handles every regex escape, not only \uXXXX;
/// confirm that is acceptable for the inputs this is used on.
/// </remarks>
string ConvertUnicodeJsonToChinese(string jsonString)
{
    return Regex.Unescape(jsonString);
}