1. Windows 下需要使用 Unicode 来处理非 ASCII 字符,例如中文文件路径。
2. 但在做字符串处理时,又需要先转换成 UTF-8 作为中间格式来处理,这就涉及到 UTF-8 与 Unicode 之间的互相转换。
3. Unicode 与 UTF-8 的关系可参考:
http://baike.baidu.com/view/40801.htm
#include <stdlib.h>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;
// Appends `code_point` to `out` as exactly three bytes, least-significant
// byte first (bits 0-7, then 8-15, then 16-23).
static void AppendCodePoint3LE(unsigned long code_point, std::string* out)
{
    out->push_back(static_cast<char>(code_point & 0xFF));
    out->push_back(static_cast<char>((code_point >> 8) & 0xFF));
    out->push_back(static_cast<char>((code_point >> 16) & 0xFF));
}

// Decodes the NUL-terminated UTF-8 string `utf8` and APPENDS the result to
// `*unicode` (existing contents are kept).
//
// Output layout: every decoded code point occupies three bytes in
// little-endian order, so an ASCII 'A' becomes 41 00 00 and U+6211 becomes
// 11 62 00. Note that this fixed 3-byte layout is not UTF-16.
//
// Parameters:
//   utf8    - NUL-terminated UTF-8 input; a null pointer is treated as empty.
//   unicode - destination string; a null pointer makes the call a no-op.
//
// Malformed input handling: an invalid lead byte (0x80-0xBF, 0xF8-0xFF), a
// sequence cut short by the end of the string, or a sequence whose trailing
// bytes are not of the form 10xxxxxx produces U+FFFD (FD FF 00) and skips a
// single byte, so the loop always makes progress and never reads past the
// terminator. Overlong encodings and surrogate code points are not rejected.
void Utf82Unicode(const char* utf8, std::string* unicode)
{
    if (!utf8 || !unicode)
        return;

    const unsigned long kReplacement = 0xFFFD;
    const std::size_t size = std::strlen(utf8);
    std::size_t pos = 0;

    while (pos < size)
    {
        const unsigned char lead = static_cast<unsigned char>(utf8[pos]);

        // Classify the lead byte: sequence length and the payload bits it carries.
        std::size_t length = 0;
        unsigned long code_point = 0;
        if (lead < 0x80)
        {
            length = 1;
            code_point = lead;
        }
        else if ((lead & 0xE0) == 0xC0)
        {
            length = 2;
            code_point = lead & 0x1F;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            length = 3;
            code_point = lead & 0x0F;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            length = 4;
            code_point = lead & 0x07;
        }

        // length == 0 means the byte cannot start a sequence; the second test
        // keeps every read inside [pos, size).
        bool valid = (length != 0) && (length <= size - pos);
        for (std::size_t i = 1; valid && i < length; ++i)
        {
            const unsigned char next = static_cast<unsigned char>(utf8[pos + i]);
            if ((next & 0xC0) != 0x80)
                valid = false;
            else
                code_point = (code_point << 6) | (next & 0x3F);
        }

        if (!valid)
        {
            AppendCodePoint3LE(kReplacement, unicode);
            pos += 1;
            continue;
        }

        AppendCodePoint3LE(code_point, unicode);
        pos += length;
    }
}
// Demo driver: feeds the UTF-8 bytes E6 88 91 to Utf82Unicode, prints the
// first two result bytes in hex, then the raw result followed by its size.
int main()
{
    // Zero-initialized, so the three bytes written below stay NUL-terminated.
    char utf82[12] = {0};
    utf82[0] = static_cast<char>(0xE6);
    utf82[1] = static_cast<char>(0x88);
    utf82[2] = static_cast<char>(0x91);

    std::string unicode2;
    Utf82Unicode(utf82, &unicode2);

    // Both bytes go through unsigned char so a value >= 0x80 is not
    // sign-extended by the default argument promotion before %x formats it.
    printf("%x,%x\n", (unsigned char) unicode2[0], (unsigned char) unicode2[1]);

    // Writes the raw result bytes (which may include NULs), then the byte count.
    std::cout << unicode2 << ":" << unicode2.size() << std::endl;
    return 0;
}