前提:
- 案例为C++语言
- 适用于将以 &# 或 &#x 开头的字符串(数字字符引用)转换为中文显示
- 如有问题欢迎评论沟通。
说明
- 经过查阅资料,发现以上所说字符是 HTML、XML 等 SGML 类语言的转义序列(escape sequence)。它们不是编码。&# 后面跟的是十进制数,&#x 后面跟的是十六进制数。它们有一个专业名称:NCR(numeric character reference,数字字符引用)。
- 转码思路说明:十六进制码点—>UTF-16 字节—>GB18030—>中文
代码
#include <iconv.h>
#include <unicode/ucnv.h>
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
// Charset conversion routine defined later in this file; declared here because
// strTrans calls it before its definition appears.
void CSConvert(std::string strSourceCS, const char* pSourceBuffer, int iSourceLen, std::string strTargetCS, std::string& strTarget);

// Parses one numeric character reference located at text[start].
// Accepted forms are "&#" + decimal digits and "&#x"/"&#X" + hex digits, each
// optionally followed by ';'. On success stores the code point in codePoint,
// stores the index just past the reference in next, and returns true.
// Returns false for anything that is not a reference to a valid Unicode scalar
// value (no digits, zero, a surrogate, or a value above U+10FFFF).
static bool parseNumericReference(const std::string& text, std::string::size_type start,
                                  unsigned long& codePoint, std::string::size_type& next)
{
    const std::string::size_type size = text.size();
    if (start + 2 >= size || text[start] != '&' || text[start + 1] != '#')
    {
        return false;
    }

    std::string::size_type i = start + 2;
    unsigned int base = 10;
    if (text[i] == 'x' || text[i] == 'X')
    {
        base = 16;
        ++i;
    }

    const std::string::size_type firstDigit = i;
    unsigned long value = 0;
    bool tooLarge = false;
    for (; i < size; ++i)
    {
        const char c = text[i];
        unsigned int digit;
        if (c >= '0' && c <= '9')
        {
            digit = static_cast<unsigned int>(c - '0');
        }
        else if (base == 16 && c >= 'a' && c <= 'f')
        {
            digit = 10u + static_cast<unsigned int>(c - 'a');
        }
        else if (base == 16 && c >= 'A' && c <= 'F')
        {
            digit = 10u + static_cast<unsigned int>(c - 'A');
        }
        else
        {
            break;
        }
        // Stop accumulating once the value is out of range so it cannot overflow,
        // but keep consuming digits so the whole malformed run is skipped as one unit.
        if (!tooLarge)
        {
            value = value * base + digit;
            if (value > 0x10FFFFUL)
            {
                tooLarge = true;
            }
        }
    }

    if (i == firstDigit || tooLarge || value == 0 || (value >= 0xD800UL && value <= 0xDFFFUL))
    {
        return false;
    }
    if (i < size && text[i] == ';')
    {
        ++i; // the terminating semicolon belongs to the reference
    }
    codePoint = value;
    next = i;
    return true;
}

// Writes codePoint as big-endian UTF-16 (one code unit, or a surrogate pair for
// code points above U+FFFF) followed by one zero code unit, and returns the
// number of bytes written (4 or 6).
static int encodeUtf16BE(unsigned long codePoint, char out[6])
{
    unsigned int units[3];
    int count = 0;
    if (codePoint >= 0x10000UL)
    {
        const unsigned long offset = codePoint - 0x10000UL;
        units[count++] = 0xD800u + static_cast<unsigned int>(offset >> 10);
        units[count++] = 0xDC00u + static_cast<unsigned int>(offset & 0x3FFUL);
    }
    else
    {
        units[count++] = static_cast<unsigned int>(codePoint);
    }
    // CSConvert hands its intermediate UTF-16 text to ICU as a NUL-terminated
    // string, so the terminator is supplied as part of the input.
    units[count++] = 0;

    for (int i = 0; i < count; ++i)
    {
        out[2 * i] = static_cast<char>(static_cast<unsigned char>((units[i] >> 8) & 0xFFu));
        out[2 * i + 1] = static_cast<char>(static_cast<unsigned char>(units[i] & 0xFFu));
    }
    return 2 * count;
}

// Replaces every numeric character reference in strSource ("&#NNN;" decimal or
// "&#xHHH;" hexadecimal) with the referenced character encoded as GB18030.
//
// All other text, including semicolons that are not part of a reference and
// malformed references, is copied to the result unchanged. The input and the
// result are also written to standard output, as a debugging aid.
//
// NOTE(review): only the decoded characters are converted to GB18030; the
// surrounding text keeps whatever encoding the caller passed in, so the result
// is consistent only if that text is GB18030-compatible (e.g. ASCII) - confirm
// against the callers.
//
// @param strSource text that may contain numeric character references
// @return the text with the references replaced
std::string strTrans(std::string strSource)
{
    std::cout << "转化前的字符串为 : " << strSource << std::endl;

    std::string result;
    result.reserve(strSource.size());

    std::string::size_type pos = 0;
    while (pos < strSource.size())
    {
        const std::string::size_type amp = strSource.find("&#", pos);
        if (amp == std::string::npos)
        {
            result.append(strSource, pos, std::string::npos);
            break;
        }
        result.append(strSource, pos, amp - pos);

        unsigned long codePoint = 0;
        std::string::size_type next = amp;
        if (parseNumericReference(strSource, amp, codePoint, next))
        {
            char utf16[6];
            const int byteCount = encodeUtf16BE(codePoint, utf16);
            std::string converted;
            // The byte order is named explicitly so the conversion does not
            // depend on BOM detection or platform defaults.
            CSConvert("UTF-16BE", utf16, byteCount, "GB18030", converted);
            result += converted;
            pos = next;
        }
        else
        {
            // Not a valid reference: keep the two characters and continue after them.
            result += "&#";
            pos = amp + 2;
        }
    }

    std::cout << "转化后的字符串为 : " << result << std::endl;
    return result;
}
/*result为转换后中文格式的字符串*/
// Converts a hexadecimal string to an integer.
//
// An optional "0x"/"0X" prefix is skipped. Parsing stops at the first character
// that is not a hexadecimal digit (0-9, a-f, A-F), so "1G" yields 1 and a string
// without any leading hex digit yields 0. A null pointer also yields 0.
//
// @param s NUL-terminated string to parse
// @return the parsed value; input too long for an int wraps around instead of
//         triggering signed overflow
int htoi(const char s[])
{
    if (!s)
    {
        return 0;
    }

    int i = 0;
    if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
    {
        i = 2; // skip the prefix
    }

    // Accumulate as unsigned so overly long input cannot cause undefined behaviour.
    unsigned int n = 0;
    for (;; ++i)
    {
        const char c = s[i];
        unsigned int digit;
        if (c >= '0' && c <= '9')
        {
            digit = static_cast<unsigned int>(c - '0');
        }
        else if (c >= 'a' && c <= 'f')
        {
            digit = 10u + static_cast<unsigned int>(c - 'a');
        }
        else if (c >= 'A' && c <= 'F')
        {
            digit = 10u + static_cast<unsigned int>(c - 'A');
        }
        else
        {
            break; // terminator or a character that is not a hex digit
        }
        n = 16u * n + digit;
    }
    return static_cast<int>(n);
}
// Splits val into the pieces separated by token.
//
// Empty pieces (leading, trailing or between adjacent separators) are dropped.
// An empty token cannot separate anything, so val is returned as a single piece
// when it is non-empty.
//
// @param val   text to split
// @param token separator string
// @return the non-empty pieces in their original order
std::vector<std::string> getTokenList(const std::string& val, const std::string& token)
{
    std::vector<std::string> pieces;
    if (token.empty())
    {
        if (!val.empty())
        {
            pieces.push_back(val);
        }
        return pieces;
    }

    // Walk the string with an index instead of repeatedly copying the remainder.
    std::string::size_type begin = 0;
    for (;;)
    {
        const std::string::size_type hit = val.find(token, begin);
        if (hit == std::string::npos)
        {
            break;
        }
        if (hit != begin)
        {
            pieces.push_back(val.substr(begin, hit - begin));
        }
        begin = hit + token.length();
    }

    if (begin < val.length())
    {
        pieces.push_back(val.substr(begin)); // text after the last separator
    }
    return pieces;
}
//编码转换方法
void CSConvert(string strSourceCS /*"UTF-8"*/,const char* pSourceBuffer,int iSourceLen,string strTargetCS/*"GB2312"*/,string& strTarget)
{
UErrorCode status = U_ZERO_ERROR;
UChar target[iSourceLen*2];
UConverter *conv;
int32_t len;
//1 convert strSourceCS string to Unicode
// set up the converter
conv = ucnv_open(strSourceCS.c_str(), &status);
assert(U_SUCCESS(status));
// convert to Unicode
len = ucnv_toUChars(conv, target, iSourceLen*2, pSourceBuffer, iSourceLen, &status);
assert(U_SUCCESS(status));
// close the converter
ucnv_close(conv);
//2 convert Unicode string to strTargetCS
// set up the converter
conv = ucnv_open(strTargetCS.c_str(), &status);
assert(U_SUCCESS(status));
// convert to strTargetCS
char gbTarget[iSourceLen*2];
len = ucnv_fromUChars(conv, gbTarget, iSourceLen*2, target, -1, &status);
assert(U_SUCCESS(status));
// close the converter
ucnv_close(conv);
strTarget = gbTarget;
return ;
}
这样就可以调用了,如下:
int main()
{
string strSource = "用户";
string strTar = strTrans(strSource);
cout<<"转换后的字符串为:"<<strTar<<endl;
return 0;
}
代码就是这样了,供学习交流