关于 UTF-8 和 Unicode 的编码原理,可以参考这篇文章:
http://hi.baidu.com/dustin_xiao/blog/item/2ab75b24c27ca32ed507426f.html
下面是实现代码,只实现了中文和英文字符的转换部分:
1
/**
 * Returns the total length, in bytes, of the UTF-8 sequence introduced by
 * the lead byte firstCh.
 *
 * @param firstCh  First byte of a UTF-8 encoded character.
 * @return 1 for an ASCII byte (0x00-0x7F, control characters included),
 *         2, 3 or 4 for a multi-byte lead byte, and 0 when the byte cannot
 *         start a sequence (a continuation byte 10xxxxxx, or 0xF8-0xFF).
 *
 * Only the length marker bits are inspected; overlong encodings are not
 * rejected here.
 */
int GetUtf8ByteNumForWord(char firstCh)
{
    // Plain char may be signed, so classify an unsigned copy of the byte.
    const unsigned char lead = static_cast<unsigned char>(firstCh);

    if ((lead & 0x80) == 0x00) return 1;  // 0xxxxxxx
    if ((lead & 0xE0) == 0xC0) return 2;  // 110xxxxx
    if ((lead & 0xF0) == 0xE0) return 3;  // 1110xxxx
    if ((lead & 0xF8) == 0xF0) return 4;  // 11110xxx
    return 0;                             // continuation byte or invalid lead
}

/**
 * Decodes len bytes of UTF-8 text into a zero-terminated wide string.
 *
 * @param utf8     Input bytes; need not be zero-terminated.
 * @param len      Number of bytes to read from utf8.
 * @param unicode  Output buffer. The caller must provide room for at least
 *                 len + 1 wchar_t elements (no sequence produces more
 *                 output elements than it has input bytes).
 *
 * Each decoded code point is stored as one wchar_t. When wchar_t is 16 bits
 * wide, code points above U+FFFF are stored as a UTF-16 surrogate pair.
 * Decoding stops at the first invalid lead byte, bad continuation byte,
 * truncated sequence or code point above U+10FFFF; whatever was decoded up
 * to that point is kept. The output is zero-terminated on every path, so it
 * is always safe to use as a C wide string.
 */
void Utf8ToUnicode(const char* utf8, int len, wchar_t* unicode)
{
    if (!unicode)
    {
        return;
    }

    // Payload bits carried by the lead byte, indexed by sequence length.
    static const unsigned char kLeadPayloadMask[5] = {0x00, 0x7F, 0x1F, 0x0F, 0x07};

    int in = 0;
    int out = 0;
    while (utf8 && in < len)
    {
        const int nByteNum = GetUtf8ByteNumForWord(utf8[in]);
        // Reject unknown lengths and sequences that would run past the input.
        if (nByteNum < 1 || nByteNum > 4 || nByteNum > len - in)
        {
            break;
        }

        const unsigned char lead = static_cast<unsigned char>(utf8[in]);
        unsigned long codePoint = lead & kLeadPayloadMask[nByteNum];

        // Every continuation byte contributes its low six bits.
        bool wellFormed = true;
        for (int k = 1; k < nByteNum; ++k)
        {
            const unsigned char trail = static_cast<unsigned char>(utf8[in + k]);
            if ((trail & 0xC0) != 0x80)
            {
                wellFormed = false;
                break;
            }
            codePoint = (codePoint << 6) | (trail & 0x3F);
        }
        if (!wellFormed || codePoint > 0x10FFFFUL)
        {
            break;
        }

        if (codePoint > 0xFFFFUL && sizeof(wchar_t) == 2)
        {
            // 16-bit wchar_t cannot hold the value directly: split it into
            // a high/low surrogate pair.
            const unsigned long offset = codePoint - 0x10000UL;
            unicode[out++] = static_cast<wchar_t>(0xD800UL + (offset >> 10));
            unicode[out++] = static_cast<wchar_t>(0xDC00UL + (offset & 0x3FFUL));
        }
        else
        {
            unicode[out++] = static_cast<wchar_t>(codePoint);
        }
        in += nByteNum;
    }

    unicode[out] = L'\0';
}
65
2 {
3 int nRet = 0 ;
4 __asm
5 {
6 movzx ecx, byte ptr[firstCh]
7 and ecx, 0xE0
8 jz done
9 test ecx, 0x80
10 jnz lbm
11 mov nRet, 1
12 jmp done
13 lbm:
14 cmp cl, 0xE0
15 jz lb3
16 cmp cl, 0x0C
17 jz lb2
18 jmp done
19 lb3:
20 mov nRet, 3
21 jmp done
22 lb2:
23 mov nRet, 2
24 done:
25 }
26 return nRet;
27 }
28 void Utf8ToUnicode( const char * utf8, int len, wchar_t * unicode)
29 {
30 int i = 0 ;
31 int j = 0 ;
32 char * temp = ( char * )unicode;
33 // 循环解析
34 while (i < len)
35 {
36 int nByteNum = GetUtf8ByteNumForWord(utf8[i]);
37 if (nByteNum == 0 )
38 {
39 return ;
40 }
41 switch (nByteNum)
42 {
43 case 1 :
44 temp[j] = utf8[i];
45 temp[j + 1 ] = 0 ;
46 break ;
47 case 2 :
48 temp[j] = utf8[i];
49 temp[j + 1 ] = utf8[i + 1 ];
50 break ;
51 case 3 :
52 // 这里就开始进行UTF8->Unicode
53 temp[j + 1 ] = ((utf8[i] & 0x0F ) << 4 ) | ((utf8[i + 1 ] >> 2 ) & 0x0F );
54 temp[j] = ((utf8[i + 1 ] & 0x03 ) << 6 ) + (utf8[i + 2 ] & 0x3F );
55 break ;
56 default :
57 break ;
58 }
59 j += 2 ;
60 i += nByteNum;
61 }
62 temp[j] = 0 ;
63 temp[j + 1 ] = 0 ;
64 }
65
测试代码如下:
1
std::ifstream fin(
"
debug\\Test.txt
"
);
2 const unsigned int L_MAX_LINE = 1024 ;
3 char utf8[L_MAX_LINE];
4 wchar_t unicode[L_MAX_LINE];
5 while (fin.getline(utf8,L_MAX_LINE))
6 {
7 Utf8ToUnicode(utf8,strlen(utf8),unicode);
8 MessageBoxW( 0 ,unicode, 0 , 0 );
9 }
10 fin.close();
2 const unsigned int L_MAX_LINE = 1024 ;
3 char utf8[L_MAX_LINE];
4 wchar_t unicode[L_MAX_LINE];
5 while (fin.getline(utf8,L_MAX_LINE))
6 {
7 Utf8ToUnicode(utf8,strlen(utf8),unicode);
8 MessageBoxW( 0 ,unicode, 0 , 0 );
9 }
10 fin.close();
用一篇文章进行测试,结果如下: