C++中将GBK转换为UTF-8时,如果汉字数量为奇数,最后一个汉字会出现乱码。
原因:
1、char *strOut = new char[nLength+1]; 申请的长度是不够的。例如“天安门”是3个汉字,nLength为3,但UTF-8编码下一个汉字占3个字节,因此至少需要申请10个字节(3*3+1,最后1个字节存放字符串结束符)。2、奇数个汉字转码后,再由UTF-8转回GBK时,最后一个字符一直显示为“?”。因为一个汉字转成UTF-8需要3个字节,3个汉字就成了9个字节,而解码时是按每2个字节一组转换成字符的,当字节数为奇数时,最后1个字节无法正确转换,于是被直接替换为“?”,这样就改变了数据,影响后面的解码。
解决方案与示例:
#include "stdafx.h"

#include <stdio.h>
#include <windows.h>

#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
using namespace std;

/**
 * Converts a NUL-terminated GBK string to UTF-8.
 *
 * The input is first widened to UTF-16 with MultiByteToWideChar(CP_ACP) and
 * then encoded with WideCharToMultiByte(CP_UTF8). The intermediate buffer is
 * owned by a std::vector, so it is released on every return path.
 *
 * NOTE(review): CP_ACP is the system's active ANSI code page, so the input is
 * only decoded as GBK when that code page is 936 - confirm for the machines
 * this runs on.
 *
 * @param lpGBKStr     Source string, NUL-terminated. NULL makes the call fail.
 * @param lpUTF8Str    Destination buffer, or NULL to only query the size.
 * @param nUTF8StrLen  Capacity of lpUTF8Str in bytes; ignored when lpUTF8Str
 *                     is NULL.
 * @return The number of bytes required (query mode) or written, including the
 *         terminating NUL; 0 if a conversion fails or the buffer is too small.
 */
int GBKToUTF8(unsigned char * lpGBKStr, unsigned char * lpUTF8Str, int nUTF8StrLen)
{
    if (!lpGBKStr)
        return 0;

    const char * pszSource = reinterpret_cast<const char *>(lpGBKStr);

    // Size query: with cbMultiByte == -1 the count includes the NUL terminator.
    int nWideLen = ::MultiByteToWideChar(CP_ACP, 0, pszSource, -1, NULL, 0);
    if (nWideLen <= 0)
        return 0;

    std::vector<wchar_t> wideBuf(nWideLen);
    nWideLen = ::MultiByteToWideChar(CP_ACP, 0, pszSource, -1, &wideBuf[0], nWideLen);
    if (nWideLen <= 0)
        return 0;

    // Size query for the UTF-8 output, again including the NUL terminator.
    const int nRequired = ::WideCharToMultiByte(CP_UTF8, 0, &wideBuf[0], -1, NULL, 0, NULL, NULL);
    if (nRequired <= 0)
        return 0;

    if (!lpUTF8Str)
        return nRequired; // caller only asked for the required size

    if (nUTF8StrLen < nRequired)
        return 0; // destination buffer is too small

    return ::WideCharToMultiByte(CP_UTF8, 0, &wideBuf[0], -1,
                                 reinterpret_cast<char *>(lpUTF8Str), nUTF8StrLen, NULL, NULL);
}
/**
 * Converts a NUL-terminated UTF-8 string to GBK.
 *
 * The input is first decoded to UTF-16 with MultiByteToWideChar(CP_UTF8) and
 * then encoded with WideCharToMultiByte(CP_ACP). The intermediate buffer is
 * owned by a std::vector, so it is released on every return path.
 *
 * NOTE(review): CP_ACP is the system's active ANSI code page, so the output is
 * only GBK when that code page is 936 - confirm for the machines this runs on.
 *
 * @param lpUTF8Str   Source string, NUL-terminated. NULL makes the call fail.
 * @param lpGBKStr    Destination buffer, or NULL to only query the size.
 * @param nGBKStrLen  Capacity of lpGBKStr in bytes; ignored when lpGBKStr is
 *                    NULL.
 * @return The number of bytes required (query mode) or written, including the
 *         terminating NUL; 0 if a conversion fails or the buffer is too small.
 */
int UTF8ToGBK(unsigned char * lpUTF8Str, unsigned char * lpGBKStr, int nGBKStrLen)
{
    if (!lpUTF8Str)
        return 0;

    const char * pszSource = reinterpret_cast<const char *>(lpUTF8Str);

    // Size query: with cbMultiByte == -1 the count includes the NUL terminator.
    int nWideLen = ::MultiByteToWideChar(CP_UTF8, 0, pszSource, -1, NULL, 0);
    if (nWideLen <= 0)
        return 0;

    std::vector<wchar_t> wideBuf(nWideLen);
    nWideLen = ::MultiByteToWideChar(CP_UTF8, 0, pszSource, -1, &wideBuf[0], nWideLen);
    if (nWideLen <= 0)
        return 0;

    // Size query for the GBK output, again including the NUL terminator.
    const int nRequired = ::WideCharToMultiByte(CP_ACP, 0, &wideBuf[0], -1, NULL, 0, NULL, NULL);
    if (nRequired <= 0)
        return 0;

    if (!lpGBKStr)
        return nRequired; // caller only asked for the required size

    if (nGBKStrLen < nRequired)
        return 0; // destination buffer is too small

    return ::WideCharToMultiByte(CP_ACP, 0, &wideBuf[0], -1,
                                 reinterpret_cast<char *>(lpGBKStr), nGBKStrLen, NULL, NULL);
}
/**
 * Example driver for GBKToUTF8 / UTF8ToGBK.
 *
 * Converts a literal with an odd number of Chinese characters to UTF-8, writes
 * it to result.txt, converts it back and writes that to result.txt as well.
 * Each conversion is done in two steps: a size query with a NULL destination,
 * then the real conversion into a buffer of that size.
 *
 * @return Always 0, including when a conversion fails.
 */
int main()
{
    char cGBKStr[] = "奇数个汉字";
    unsigned char * lpSource = reinterpret_cast<unsigned char *>(cGBKStr);

    // Query the size of the UTF-8 form (the count includes the terminating NUL).
    int nRetLen = GBKToUTF8(lpSource, NULL, 0);
    printf("转换后的字符串需要的空间长度为:%d ", nRetLen);

    // One spare, zero-initialised byte keeps the buffer non-empty and terminated
    // even if the size query returned 0.
    std::vector<char> utf8Buf(nRetLen + 1);
    nRetLen = GBKToUTF8(lpSource, reinterpret_cast<unsigned char *>(&utf8Buf[0]), nRetLen);
    if (!nRetLen)
    {
        printf("GBKToUTF8转换失败!");
        return 0;
    }
    printf("GBKToUTF8转换成功!");

    {
        std::ofstream out("result.txt");
        if (out.is_open())
        {
            out << &utf8Buf[0];
            out.close();
        }
    }

    // Convert back to GBK: size query first, then the real conversion.
    unsigned char * lpUTF8 = reinterpret_cast<unsigned char *>(&utf8Buf[0]);
    nRetLen = UTF8ToGBK(lpUTF8, NULL, 0);
    printf("转换后的字符串需要的空间长度为:%d ", nRetLen);

    std::vector<char> gbkBuf(nRetLen + 1);
    nRetLen = UTF8ToGBK(lpUTF8, reinterpret_cast<unsigned char *>(&gbkBuf[0]), nRetLen);
    if (!nRetLen)
    {
        printf("UTF8ToGBK转换失败! ");
        return 0;
    }
    printf("UTF8ToGBK转换成功! ");

    {
        // Re-opening result.txt truncates it, so the file ends up holding only
        // the round-tripped text.
        std::ofstream out("result.txt");
        if (out.is_open())
        {
            out << &gbkBuf[0];
            out.close();
        }
    }

    return 0;
}