如果您对UTF-8、Unicode、GB2312等还是很陌生的话,请查看 http://www.linuxforum.net/books/UTF-8-Unicode.html ,我这里就不浪费口舌了。下面介绍一下WinAPI的两个函数:WideCharToMultiByte、MultiByteToWideChar。
函数原型:
int WideCharToMultiByte( UINT CodePage, // code page DWORD dwFlags, // performance and mapping flags LPCWSTR lpWideCharStr, // wide-character string int cchWideChar, // number of chars in string LPSTR lpMultiByteStr, // buffer for new string int cbMultiByte, // size of buffer LPCSTR lpDefaultChar, // default for unmappable chars LPBOOL lpUsedDefaultChar // set when default char used ); //将宽字符转换成多个窄字符 int MultiByteToWideChar( UINT CodePage, // code page DWORD dwFlags, // character-type options LPCSTR lpMultiByteStr, // string to map int cbMultiByte, // number of bytes in string LPWSTR lpWideCharStr, // wide-character buffer int cchWideChar // size of buffer );//将多个窄字符转换成宽字符需要用到的一些函数:
CString CXmlProcess::HexToBin(CString string)//将16进制数转换成2进制 { if( string == "0") return "0000"; if( string == "1") return "0001"; if( string == "2") return "0010"; if( string == "3") return "0011"; if( string == "4") return "0100"; if( string == "5") return "0101"; if( string == "6") return "0110"; if( string == "7") return "0111"; if( string == "8") return "1000"; if( string == "9") return "1001"; if( string == "a") return "1010"; if( string == "b") return "1011"; if( string == "c") return "1100"; if( string == "d") return "1101"; if( string == "e") return "1110"; if( string == "f") return "1111"; return ""; } CString CXmlProcess::BinToHex(CString BinString)//将2进制数转换成16进制 { if( BinString == "0000") return "0"; if( BinString == "0001") return "1"; if( BinString == "0010") return "2"; if( BinString == "0011") return "3"; if( BinString == "0100") return "4"; if( BinString == "0101") return "5"; if( BinString == "0110") return "6"; if( BinString == "0111") return "7"; if( BinString == "1000") return "8"; if( BinString == "1001") return "9"; if( BinString == "1010") return "a"; if( BinString == "1011") return "b"; if( BinString == "1100") return "c"; if( BinString == "1101") return "d"; if( BinString == "1110") return "e"; if( BinString == "1111") return "f"; return ""; } int CXmlProcess::BinToInt(CString string)//2进制字符数据转换成10进制整型 { int len =0; int tempInt = 0; int strInt = 0; for(int i =0 ;i < string.GetLength() ;i ++) { tempInt = 1; strInt = (int)string.GetAt(i)-48; for(int k =0 ;k < 7-i ; k++) { tempInt = 2*tempInt; } len += tempInt*strInt; } return len; }UTF-8转换成GB2312先把UTF-8转换成Unicode.然后再把Unicode通过函数WideCharToMultiByte转换成GB2312
WCHAR* CXmlProcess::UTF_8ToUnicode(char *ustart) //把UTF-8转换成Unicode { char char_one; char char_two; char char_three; int Hchar; int Lchar; char uchar[2]; WCHAR *unicode; CString string_one; CString string_two; CString string_three; CString combiString; char_one = *ustart; char_two = *(ustart+1); char_three = *(ustart+2); string_one.Format("%x",char_one); string_two.Format("%x",char_two); string_three.Format("%x",char_three); string_three = string_three.Right(2); string_two = string_two.Right(2); string_one = string_one.Right(2); string_three = HexToBin(string_three.Left(1))+HexToBin(string_three.Right(1)); string_two = HexToBin(string_two.Left(1))+HexToBin(string_two.Right(1)); string_one = HexToBin(string_one.Left(1))+HexToBin(string_one.Right(1)); combiString = string_one +string_two +string_three; combiString = combiString.Right(20); combiString.Delete(4,2); combiString.Delete(10,2); Hchar = BinToInt(combiString.Left(8)); Lchar = BinToInt(combiString.Right(8)); uchar[1] = (char)Hchar; uchar[0] = (char)Lchar; unicode = (WCHAR *)uchar; return unicode; } char * CXmlProcess::UnicodeToGB2312(unsigned short uData) //把Unicode 转换成 GB2312 { char *buffer ; buffer = new char[sizeof(WCHAR)]; WideCharToMultiByte(CP_ACP,NULL,&uData,1,buffer,sizeof(WCHAR),NULL,NULL); return buffer; }
GB2312转换成UTF-8:先通过函数MultiByteToWideChar把GB2312转换成Unicode,然后再把Unicode拆开,重新拼装成UTF-8。
WCHAR * CXmlProcess::Gb2312ToUnicode(char *gbBuffer) //GB2312 转换成 Unicode { WCHAR *uniChar; uniChar = new WCHAR[1]; ::MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,gbBuffer,2,uniChar,1); return uniChar; } char * CXmlProcess::UnicodeToUTF_8(WCHAR *UniChar) // Unicode 转换成UTF-8 { char *buffer; CString strOne; CString strTwo; CString strThree; CString strFour; CString strAnd; buffer = new char[3]; int hInt,lInt; hInt = (int)((*UniChar)/256); lInt = (*UniChar)%256; CString string ; string.Format("%x",hInt); strTwo = HexToBin(string.Right(1)); string = string.Left(string.GetLength() - 1); strOne = HexToBin(string.Right(1)); string.Format("%x",lInt); strFour = HexToBin(string.Right(1)); string = string.Left(string.GetLength() -1); strThree = HexToBin(string.Right(1)); strAnd = strOne +strTwo + strThree + strFour; strAnd.Insert(0,"1110"); strAnd.Insert(8,"10"); strAnd.Insert(16,"10"); strOne = strAnd.Left(8); strAnd = strAnd.Right(16); strTwo = strAnd.Left(8); strThree = strAnd.Right(8); *buffer = (char)BinToInt(strOne); buffer[1] = (char)BinToInt(strTwo); buffer[2] = (char)BinToInt(strThree); return buffer; }例子:将GB2312转换成UTF-8的调用:
char * CXmlProcess::translateCharToUTF_8(char *xmlStream, int len) { int newCharLen =0 ; int oldCharLen = 0; int revCharLen = len; char* newCharBuffer; char* finalCharBuffer; char *buffer ; CString string; buffer = new char[sizeof(WCHAR)]; newCharBuffer = new char[int(1.5*revCharLen)];//设置最大的一个缓冲区 while(oldCharLen < revCharLen) { if( *(xmlStream + oldCharLen) >= 0) { *(newCharBuffer+newCharLen) = *(xmlStream +oldCharLen); newCharLen ++; oldCharLen ++; }//如果是英文直接复制就可以 else { WCHAR *pbuffer = this->Gb2312ToUnicode(xmlStream+oldCharLen); buffer = this->UnicodeToUTF_8(pbuffer); *(newCharBuffer+newCharLen) = *buffer; *(newCharBuffer +newCharLen +1) = *(buffer + 1); *(newCharBuffer +newCharLen +2) = *(buffer + 2); newCharLen += 3; oldCharLen += 2; } } newCharBuffer[newCharLen] = ''\0''; CString string1 ; string1.Format("%s",newCharBuffer); finalCharBuffer = new char[newCharLen+1]; memcpy(finalCharBuffer,newCharBuffer,newCharLen+1); return finalCharBuffer; }
程序都非常的简单,由于实在太穷。已经吃了两天的方便面。所以现在头昏,程序的详细说明就不写了。程序员到了像我这样的地步也真是少见。工资低没有办法。哎!!!!
/*
字符串编码转换 GBK to UTF8 (ansi版)
xmwen@126.com
*/
/*
 * Converts a NUL-terminated GBK string to UTF-8.
 *
 * The text is first widened with MultiByteToWideChar(CP_GBK, ...) and then
 * narrowed again with WideCharToMultiByte(CP_UTF8, ...). Each API is called
 * twice: once to query the required size, once to perform the conversion.
 *
 * Returns a malloc'ed, NUL-terminated UTF-8 string that the caller must
 * free(), or NULL when strGBK is NULL, an allocation fails, or either
 * conversion step fails.
 */
char *gbk2utf8(const char *strGBK){
    wchar_t *wide = NULL;
    char *utf8 = NULL;
    int wideLen;
    int utf8Len;

    if (strGBK == NULL) {
        return NULL;
    }

    /* Size query: number of wide characters, terminator included. */
    wideLen = MultiByteToWideChar(CP_GBK, 0, strGBK, -1, NULL, 0);
    if (wideLen < 1) {
        return NULL;
    }

    wide = (wchar_t *) malloc(sizeof(wchar_t) * wideLen);
    if (wide == NULL) {
        return NULL;
    }

    if (MultiByteToWideChar(CP_GBK, 0, strGBK, -1, wide, wideLen) >= 1) {
        /* Size query: number of UTF-8 bytes, terminator included. */
        utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
        if (utf8Len >= 1) {
            utf8 = (char *) malloc(sizeof(char) * utf8Len);
            if (utf8 != NULL &&
                WideCharToMultiByte(CP_UTF8, 0, wide, -1, utf8, utf8Len, NULL, NULL) < 1) {
                /* Conversion failed: discard the partially filled buffer. */
                free(utf8);
                utf8 = NULL;
            }
        }
    }

    /* The intermediate wide string is no longer needed on any path. */
    free(wide);
    return utf8;
} ( xmwen 发表于 2009-11-3 19:38:00)
[ 原创文档 本文适合中级读者 已阅读34485次 ]
搞笑,这种害人害己的文章还有这么多人访问。
作者光知道 WideCharToMultiByte 可以把 Unicode 转成 GB2312 就不知道也可以把 Unicode 转换为 UTF-8 吗?
其实这是一个很简单的程序,都被作者搞复杂了。
要实现 GB2312 (其实是GBK)转换为 UTF-8 其实很简单,先用 MultiByteToWideChar 把 GB2312 转换为 Unicode,再用 WideCharToMultiByte 把 Unicode 转换为 UTF-8 就可以了。
UTF-8 转换为 GB2312 是个相反的过程,先用 MultiByteToWideChar 把 UTF-8 转换为 Unicode,再用 WideCharToMultiByte 把 Unicode 转换为 GB2312 就可以了。 ( 雁过留声 发表于 2007-1-11 9:11:00)
translateCharToUTF_8的编码不对,
请作者检查一下,
如: "你是我的好朋友"
转换成了;"浣犳槸鎴戠殑濂芥i脲弸鍚?"
正确的应是:
"浣犳槸鎴戠殑濂芥湅鍙嬪悧"
对于有的编码还能对...
交流一下:kudoo.aos@gmail.com
( kudoo 发表于 2006-8-20 19:46:00)
shines在2005-2-6,提供了一段程序,里面有
buffersize = WideCharToMultiByte(CP_UTF8, MB_PRECOMPOSED, unicode, wide_size, NULL, 0, NULL, 0);
buffer = new char[buffersize+1];
但是,我在调试的时候发现:buffersize似乎已经预先留了‘\0’的位置,或者是不是我出错了
比如:“i love you,爱”GB2312是需要14个字节
UTF8是需要15个字节,返回时候就是这些了啊,
我的地址是:robin-fox@sohu.com,
谁能回答一下,感谢!! ( robin_fox_nan 发表于 2006-3-19 20:20:00)
晕.格式没有了
原文请看
http://www.kbadboy.com/viewfull.asp?id=33 ( 鬼龙之舞 发表于 2005-8-25 16:13:00)
支持楼主!是因为你我才写出来的,不管是在体积还是在速度,相信都比楼主的强一点,如果不考虑移植性的话
感谢楼主!!
; UTF8toUnicode: walks the zero-terminated byte string at lpszUTF8_IN and
; writes 16-bit units to lpszBuf_OUT, ending with a zero word.
;   - a byte with the high bit set is taken as the first byte of a
;     three-byte sequence; its payload bits are packed into one 16-bit unit
;   - any other non-zero byte is widened to 16 bits with a zero high byte
;   - a zero byte terminates the loop
; NOTE(review): every byte with the high bit set is handled as a three-byte
; sequence; two- and four-byte UTF-8 sequences are not distinguished.
; NOTE(review): the output buffer size is not checked here - the caller
; presumably guarantees enough room.
UTF8toUnicode proc uses esi edi lpszBuf_OUT,lpszUTF8_IN
mov esi,lpszUTF8_IN
mov edi,lpszBuf_OUT
.while TRUE
mov al,[esi]
; signed compare: true when bit 7 of the current byte is set
.if sbyte ptr al <0
; high output byte, upper nibble: low 4 bits of input byte 0
mov al,[esi]
and al,00001111b
shl al,4
mov [edi+1],al
; high output byte, lower nibble: bits 5..2 of input byte 1
mov al,[esi+1]
and al,00111100b
shr al,2
or [edi+1],al
; low output byte, top 2 bits: bits 1..0 of input byte 1
mov al,[esi+1]
and al,11b
shl al,6
mov [edi+0],al
; low output byte, low 6 bits: bits 5..0 of input byte 2
mov al,[esi+2]
and al,00111111b
or [edi+0],al
; one 16-bit unit written, three input bytes consumed
add edi,2
add esi,3
.elseif al
; non-zero byte without the high bit: zero-extend into ax and store it
xor ah,ah
; stosw stores ax at [edi] and steps edi by 2 (assumes the direction flag is clear)
stosw
inc esi
.else
; zero byte: write the terminating zero word and stop
mov WORD ptr [edi],0
.break
.endif
.endw
ret
UTF8toUnicode endp ( 鬼龙之舞 发表于 2005-8-25 16:11:00)
; UnicodetoUTF8: walks the zero-terminated string of 16-bit units at
; lpszUTF8_IN (despite its name, this parameter is the 16-bit input) and
; writes UTF-8 bytes to lpBuf_OUT.
;   - a zero unit writes the terminator and ends the loop
;   - a unit whose high byte is zero is emitted as ONE byte
;   - every other unit is emitted as a three-byte sequence
;     1110xxxx 10xxxxxx 10xxxxxx
; Fix: the single-byte branch used stosw, which stored the character plus an
; extra 0x00 byte, so the output was cut off after the first ASCII character
; when read as a C string. It now uses stosb.
; The terminator is still written as a zero WORD (two zero bytes), as before,
; so the output buffer needs room for two terminating bytes.
; NOTE(review): units 0080h..00FFh are copied as a single raw byte and units
; 0100h..07FFh use the three-byte form; neither is well-formed UTF-8.
; NOTE(review): the output buffer size is not checked here.
UnicodetoUTF8 proc uses esi edi lpBuf_OUT,lpszUTF8_IN
mov esi,lpszUTF8_IN
mov edi,lpBuf_OUT
.while TRUE
mov ax,[esi]
.if ax==0
; end of input: store the terminator and stop
stosw
.break
.elseif ah==0
; single-byte character: store only al (one output byte)
add esi,2
stosb
.else
; byte 0: 1110 + top 4 bits of the high input byte
mov al,[esi+1]
shr al,4
or al,11100000b
mov [edi+0],al
; byte 1: 10 + low 4 bits of the high input byte + top 2 bits of the low input byte
mov al,[esi+1]
and al,00001111b
shl al,2
or al,10000000b
mov ah,[esi+0]
shr ah,6
or al,ah
mov [edi+1],al
; byte 2: 10 + low 6 bits of the low input byte
mov al,[esi+0]
and al,00111111b
or al,10000000b
mov [edi+2],al
; three output bytes written, one 16-bit unit consumed
add edi,3
add esi,2
.endif
.endw
ret
UnicodetoUTF8 endp ( 鬼龙之舞 发表于 2005-8-25 16:11:00)
帮忙弄一下 ( zztop5384 发表于 2005-4-18 10:35:00)
// Prototype only (no implementation is given here): converts a
// wide-character string into a multibyte string.
int WideCharToMultiByte(
UINT CodePage, // code page
DWORD dwFlags, // performance and mapping flags
LPCWSTR lpWideCharStr, // wide-character string
int cchWideChar, // number of chars in string
LPSTR lpMultiByteStr, // buffer for new string
int cbMultiByte, // size of buffer
LPCSTR lpDefaultChar, // default for unmappable chars
LPBOOL lpUsedDefaultChar // set when default char used
); // converts wide characters into multiple narrow characters
这些只是函数原型,并没有具体实现 ( zztop5384 发表于 2005-4-18 10:27:00)
//对不起,少加了个括号
/**
 * Decodes one three-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx) into a
 * single 16-bit Unicode code unit using bit operations.
 *
 * Fixes over the posted version: the undeclared char_one/char_two/char_three
 * assignments (one of them with a stray parenthesis) are removed, and the
 * result is no longer a pointer to a local stack array, which was dangling
 * once the function returned.
 *
 * @param pText  pointer to the first byte of a three-byte UTF-8 sequence;
 *               exactly three bytes are read, the lead byte is not validated
 * @return a new WCHAR[2] holding the code unit followed by a terminating 0;
 *         the caller owns it and must release it with delete[].
 *         NULL when pText is NULL.
 */
WCHAR* CXmlProcess::UTF_8ToUnicode(char *pText)
{
    if (pText == NULL)
        return NULL;

    // Use unsigned bytes so right shifts do not drag in sign bits.
    const unsigned int lead  = static_cast<unsigned char>(pText[0]);
    const unsigned int mid   = static_cast<unsigned char>(pText[1]);
    const unsigned int trail = static_cast<unsigned char>(pText[2]);

    WCHAR *unicode = new WCHAR[2];
    // 4 payload bits from the lead byte, 6 from each continuation byte.
    unicode[0] = static_cast<WCHAR>(((lead & 0x0F) << 12) |
                                    ((mid & 0x3F) << 6) |
                                    (trail & 0x3F));
    unicode[1] = 0;
    return unicode;
}