本文借鉴了“夜已深茶已凉”的文章《C++版本的UnEscape 解析\uxxxx\uxxxx编码字符》(链接:https://www.cnblogs.com/guolongzheng/p/9375956.html)。
原文中有一个未修正的bug:未经转义的普通字符(如字符串中的“2018”、“V”)经`UnEscape()`函数处理后没有输出。
“input”字符串:"\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63"
期望输出:“孕妇装夏装2018新款宽松短袖中长款上衣雪纺V领蝴蝶结连衣裙夏季”
实际输出:“孕妇装夏装新款宽松短袖中长款上衣雪纺领蝴蝶结连衣裙夏季”
我在原作者的代码基础上作出修正,同时作出C代码供大家参考,请多多指教。
C++:
#include <iostream>
#include <string>
#include <codecvt>
#include <locale>
#include <windows.h>
using namespace std;
namespace {

/// Length in bytes of one `\uXXXX` escape sequence.
constexpr std::string::size_type kEscapeLength = 6;

/// Code point emitted in place of an unpaired UTF-16 surrogate (U+FFFD).
constexpr char32_t kReplacementChar = 0xFFFD;

/// Returns the numeric value (0-15) of the hexadecimal digit `c`,
/// or -1 when `c` is not a hexadecimal digit.
int HexDigitValue(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

/// Tries to read a complete `\uXXXX` escape that starts at `input[pos]`.
///
/// @param input  Text being decoded.
/// @param pos    Index of the candidate backslash.
/// @param unit   Receives the 16-bit value on success; untouched on failure.
/// @return true only when a backslash, a lowercase 'u' and exactly four
///         hexadecimal digits are present.
bool ReadUnicodeEscape(const std::string& input, std::string::size_type pos,
                       char32_t& unit) {
    // Length check first so that no index below can run past the end.
    if (pos > input.size() || input.size() - pos < kEscapeLength) {
        return false;
    }
    if (input[pos] != '\\' || input[pos + 1] != 'u') {
        return false;
    }
    char32_t value = 0;
    for (std::string::size_type k = 2; k < kEscapeLength; ++k) {
        const int digit = HexDigitValue(input[pos + k]);
        if (digit < 0) {
            return false;
        }
        value = (value << 4) | static_cast<char32_t>(digit);
    }
    unit = value;
    return true;
}

/// Appends the UTF-8 encoding of code point `cp` (at most U+10FFFF) to `out`.
void AppendUtf8(std::string& out, char32_t cp) {
    if (cp < 0x80) {
        out += static_cast<char>(cp);
    } else if (cp < 0x800) {
        out += static_cast<char>(0xC0 | (cp >> 6));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    } else if (cp < 0x10000) {
        out += static_cast<char>(0xE0 | (cp >> 12));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    } else {
        out += static_cast<char>(0xF0 | (cp >> 18));
        out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    }
}

}  // namespace

/// Decodes `\uXXXX` escape sequences in `input` and returns the text as UTF-8.
///
/// - Every byte that is not part of a well-formed escape is copied unchanged,
///   so plain ASCII ("2018", "V") and bytes that are already UTF-8 survive.
/// - A truncated or non-hexadecimal escape (e.g. a trailing `\u` or `\u12`)
///   is copied verbatim instead of throwing or skipping characters.
/// - A high surrogate immediately followed by a low surrogate escape is
///   combined into one supplementary code point; an unpaired surrogate is
///   replaced by U+FFFD.
///
/// The encoding is done by hand rather than through `std::wstring_convert`,
/// which is deprecated since C++17 and depends on the width of `wchar_t`.
///
/// @param input  Text that may contain `\uXXXX` escapes.
/// @return The decoded UTF-8 string (empty when `input` is empty).
std::string Unescape(const std::string& input) {
    std::string result;
    // The decoded form is never longer than the escaped form.
    result.reserve(input.size());

    std::string::size_type i = 0;
    while (i < input.size()) {
        char32_t unit = 0;
        if (!ReadUnicodeEscape(input, i, unit)) {
            result += input[i++];
            continue;
        }
        i += kEscapeLength;

        char32_t codePoint = unit;
        if (unit >= 0xD800 && unit <= 0xDBFF) {
            // High surrogate: only valid when a low surrogate follows directly.
            char32_t low = 0;
            if (ReadUnicodeEscape(input, i, low) && low >= 0xDC00 && low <= 0xDFFF) {
                codePoint = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                i += kEscapeLength;
            } else {
                codePoint = kReplacementChar;
            }
        } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codePoint = kReplacementChar;
        }
        AppendUtf8(result, codePoint);
    }
    return result;
}
int main() {
if (GetConsoleOutputCP() != CP_UTF8)
SetConsoleOutputCP(CP_UTF8);
string input = "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63";
string result = Unescape(input);
if (!result.empty()) {
cout << "原转义字符串:"<< input << "\n\n转义还原后的结果:" << result << endl;
}
return 0;
}
C:
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <wchar.h> #include <windows.h> char* Unescape(char* input) { size_t inputLen = strlen(input); wchar_t* wresult = malloc((inputLen + 1) * sizeof(wchar_t)); if (wresult == NULL) { return NULL; } wchar_t* wr = wresult; char* p = input; while (*p) { if (*p == '\\' && *(p + 1) == 'u') { char code[5] = {0}; strncpy(code, p + 2, 4); int unicode = strtol(code, NULL, 16); *wr++ = unicode; p += 6; } else { *wr++ = *p++; } } *wr = L'\0'; int len = WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL); char* result = malloc(len + 1); if (result != NULL) { WideCharToMultiByte(CP_UTF8, 0, wresult, -1, result, len + 1, NULL, NULL); } free(wresult); return result; } int main() { char *input = "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63"; char* result = Unescape(input); if (GetConsoleOutputCP() != CP_UTF8) SetConsoleOutputCP(CP_UTF8); if (result != NULL) { printf("原转义字符串:%s\n\n转义还原后的结果:%s\n", input, result); free(result); } return 0; }
在Windows命令提示符下,代码页设为936(GBK/GB2312)或者65001(UTF-8)运行都正确: