c语言字节转字符串gbk,不依赖任何系统API,用c语言实现gbk/utf8/unicode编码转换-CSDN博客

汉字'我'

Unicode编码是 0x6211，二进制按 4/6/6 位分段为：0110 001000 010001

UTF8编码是 0xe68891，二进制为：1110 0110 10 001000 10 010001

0xC0 11000000

0xE0 11100000

| Unicode符号范围 | UTF-8编码方式

n | (十六进制) | (二进制)

---+----------------------- -----------+--------------------------------------------------------------------------

1 | 0x00 - 0x7F | 0zzzzzzz

2 | 0x80 - 0x7FF | 110yyyyy 10zzzzzz

3 | 0x800 - 0xFFFF | 1110xxxx 10yyyyyy 10zzzzzz

--------------------------------------------------------------------------------------------------------------------

4 | 0x10000 - 0x1FFFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

5 | 0x200000 - 0x3FFFFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

6 | 0x4000000 - 0x7FFFFFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

表 UTF-8的编码规则

一：unicode转utf8

'我'的unicode编码0x6211，二进制为：01100010 00010001

将二进制形式分割成3段为0110 001000 010001

(分别是高4位、中间的6位、最后的低6位)

unicode转utf8只需要这3段分别填入1110xxxx 10yyyyyy 10zzzzzz中的xxxx yyyyyy zzzzzz

得utf8编码是0xe68891，二进制为： 11100110 10001000 10010001

/*
 * Convert a UCS-2 little-endian string to UTF-8.
 *
 * Parameters:
 *   pInput  - sequence of 16-bit code units, each stored low byte first.
 *   pOutput - receives the UTF-8 bytes followed by a single '\0'. At most
 *             3 bytes are written per input code unit, plus the terminator;
 *             the caller must provide a buffer of that size.
 *
 * Returns the number of UTF-8 bytes written (terminator not counted),
 * or -1 if either pointer is null.
 *
 * Every code unit is encoded on its own: 0x0001-0x007F as one byte,
 * 0x0080-0x07FF as two bytes, 0x0800-0xFFFF as three bytes. Surrogate
 * code units are not combined into supplementary code points.
 *
 * NOTE(review): conversion stops at the first code unit whose LOW byte is
 * 0x00 (the end-of-input test looks at a single byte). A unit such as
 * U+4E00 therefore ends the conversion early. Testing both bytes would fix
 * that but requires every caller to supply a two-byte terminator.
 */
int UnicodeToUtf8(char* pInput, char *pOutput)
{
    const unsigned char *src;
    int len = 0; /* number of UTF-8 bytes produced so far */

    if (!pInput || !pOutput)
    {
        return -1;
    }

    /* Work on unsigned bytes: with a signed char, bytes >= 0x80 would be
     * sign-extended and corrupt the assembled code unit. */
    src = (const unsigned char *)pInput;

    while (src[0] != 0)
    {
        /* low byte first, then high byte */
        unsigned int wchar = ((unsigned int)src[1] << 8) | (unsigned int)src[0];

        if (wchar <= 0x7F) /* 0zzzzzzz */
        {
            pOutput[len++] = (char)wchar;
        }
        else if (wchar <= 0x7FF) /* 110yyyyy 10zzzzzz */
        {
            pOutput[len++] = (char)(0xC0 | ((wchar >> 6) & 0x1F));
            pOutput[len++] = (char)(0x80 | (wchar & 0x3F));
        }
        else /* 0x0800-0xFFFF: 1110xxxx 10yyyyyy 10zzzzzz */
        {
            pOutput[len++] = (char)(0xE0 | ((wchar >> 12) & 0x0F));
            pOutput[len++] = (char)(0x80 | ((wchar >> 6) & 0x3F));
            pOutput[len++] = (char)(0x80 | (wchar & 0x3F));
        }

        src += 2; /* advance to the next code unit */
    }

    /* terminate the UTF-8 string */
    pOutput[len] = 0;

    return len;
}

二：utf8转unicode utf8二进制形式为1110xxxx 10yyyyyy 10zzzzzz

'我'的utf8编码0xe68891，二进制为:1110 0110 10 001000 10 010001

分别提取里面的xxxx yyyyyy zzzzzz，然后组合成xxxxyyyy yyzzzzzz，

xxxxyyyy就是unicode的高8位，yyzzzzzz就是unicode的低8位

/*************************************************************************************************
 * Convert a UTF-8 string to Unicode (UCS-2LE: low byte stored at the lower address).
 *
 * Parameters:
 *   pInput  - '\0'-terminated UTF-8 input string.
 *   pOutput - output buffer; receives 2 bytes per decoded character followed by
 *             two 0x00 bytes. The caller must provide a buffer of that size.
 *
 * Returns the number of bytes of Unicode data written (the two terminating
 * zero bytes are not counted), or -1 on error: a null pointer, a lead byte
 * that does not start a 1-, 2- or 3-byte sequence, or a missing/invalid
 * continuation byte.
 **************************************************************************************************/
int Utf8ToUnicode(char* pInput, char* pOutput)
{
    const unsigned char *src;
    unsigned char *dst;
    int outputSize = 0; /* number of output bytes produced so far */

    if (!pInput || !pOutput)
    {
        return -1;
    }

    /* Decode through unsigned bytes so that shifts and masks never operate
     * on negative values. */
    src = (const unsigned char *)pInput;
    dst = (unsigned char *)pOutput;

    while (*src)
    {
        unsigned int code;
        unsigned char lead = *src;

        if (lead <= 0x7F) /* single byte: 0zzzzzzz */
        {
            code = lead;
            src += 1;
        }
        else if ((lead & 0xE0) == 0xC0) /* two bytes: 110yyyyy 10zzzzzz */
        {
            unsigned char low = src[1];

            if ((low & 0xC0) != 0x80) /* also rejects a string that ends here */
            {
                return -1;
            }

            code = ((unsigned int)(lead & 0x1F) << 6) | (unsigned int)(low & 0x3F);
            src += 2;
        }
        else if ((lead & 0xF0) == 0xE0) /* three bytes: 1110xxxx 10yyyyyy 10zzzzzz */
        {
            unsigned char middle = src[1];
            unsigned char low;

            /* Validate the second byte BEFORE touching the third: if the
             * string ends right after the lead byte, src[2] lies beyond the
             * terminator and must not be read. */
            if ((middle & 0xC0) != 0x80)
            {
                return -1;
            }

            low = src[2];
            if ((low & 0xC0) != 0x80)
            {
                return -1;
            }

            code = ((unsigned int)(lead & 0x0F) << 12)
                 | ((unsigned int)(middle & 0x3F) << 6)
                 | (unsigned int)(low & 0x3F);
            src += 3;
        }
        else /* longer sequences and stray continuation bytes are not handled */
        {
            return -1;
        }

        /* little-endian: low byte first, high byte second */
        dst[0] = (unsigned char)(code & 0xFF);
        dst[1] = (unsigned char)(code >> 8);
        dst += 2;
        outputSize += 2;
    }

    /* the Unicode string is terminated by two zero bytes */
    dst[0] = 0;
    dst[1] = 0;

    return outputSize;
}

/*
 * Usage example.
 *
 * Converts the Chinese character U+6211 from its UCS-2LE form to UTF-8 and
 * prints the resulting bytes in hexadecimal (expected output: e68891).
 * Returns 0 on success, 1 if the conversion reports an error.
 */
int main(int argc, char** argv)
{
    /* U+6211 stored low byte first, followed by a full 16-bit zero terminator */
    char unicodeStr[4] = {0x11, 0x62, 0x00, 0x00};
    /* one UCS-2 unit yields at most 3 UTF-8 bytes plus '\0'; a small fixed
     * buffer on the stack needs no heap allocation or release */
    char utf8Str[5] = {0};
    int num;
    int i;

    (void)argc;
    (void)argv;

    /* 1. Unicode -> UTF-8 */
    num = UnicodeToUtf8(unicodeStr, utf8Str);
    if (num < 0)
    {
        printf("Error!\n");
        return 1;
    }

    for (i = 0; i < num; i++)
    {
        /* %02x keeps the leading zero of bytes below 0x10 */
        printf("%02x", (unsigned int)(unsigned char)utf8Str[i]);
    } /* prints e68891 */
    printf("\n");

    /* 2. UTF-8 -> Unicode (disabled example; uncomment to try it) */
    // char utf8In[4] = {(char)0xe6, (char)0x88, (char)0x91, 0x00};
    // char unicodeOut[8] = {0};
    // int n = Utf8ToUnicode(utf8In, unicodeOut);
    // if (n == -1)
    // {
    //     printf("Error!\n");
    // }
    // else
    // {
    //     int k;
    //     for (k = 0; k < n; k++)
    //     {
    //         printf("%02x", (unsigned int)(unsigned char)unicodeOut[k]);
    //     } /* prints 1162 */
    //     printf("\n");
    // }

    return 0;
}

三、gbk与unicode互转

参照博客：

http://blog.csdn.net/tge7618291/article/details/7599902 http://www.ithao123.cn/content-1832906.html