GBK是简体中文版Windows默认的汉字编码方式(代码页936),是对国标GB 2312的扩展。想在控制台或是控件上显示汉字,使用这种编码方式最方便。
UTF-8是一种变长字节编码方式,通过增加字节数来容纳更多的信息。
UTF-8编码方式:
当码长为1字节的时候,兼容ascii编码,格式为0xxxxxxx (x处表示有效位)
当码长为2字节的时候,格式为110xxxxx 10xxxxxx
当码长为3字节的时候,格式为1110xxxx 10xxxxxx 10xxxxxx
当码长为4字节的时候,格式为11110xxx 10xxxxxx 10xxxxxx 10xxxxxx...
来自:https://blog.csdn.net/haohulala/article/details/86600936
· 早期的UTF-8规范允许码长最大为6字节,1111110x 10...(现行标准RFC 3629已将其限制为最多4字节)
· 如果一个字符的UTF-8编码为110aaaaa 10bbbbbb,则其对应的Unicode码点(二进制)为aaaaabbbbbb,其他码长同理
所以我们就有了对于单个字符的UTF-8到Unicode的转换过程
// Reports how many bytes the UTF-8 sequence starting at sUTF8 claims to
// occupy, judging only by the number of leading 1-bits in its first byte:
//   no leading 1 (0xxxxxxx)                   -> 1
//   two to six leading 1s (110xxxxx..1111110x) -> that many bytes
//   anything else (10xxxxxx, 0xFE, 0xFF)       -> 0
// Only sUTF8[0] is read; continuation bytes are not inspected.
int GetUTF8Length(unsigned char *sUTF8){
    const unsigned int leadByte = sUTF8[0];

    // Count the run of 1-bits starting at the most significant bit.
    int leadingOnes = 0;
    for (unsigned int probe = 0x80u; (leadByte & probe) != 0u; probe >>= 1) {
        ++leadingOnes;
    }

    if (leadingOnes == 0) {
        return 1;  // plain ASCII
    }
    const bool validMultiByte = (leadingOnes >= 2) && (leadingOnes <= 6);
    return validMultiByte ? leadingOnes : 0;
}
// Decodes the single UTF-8 sequence that starts at sUTF8 and returns its
// code point.
//
// The sequence length (1..6 bytes) is taken from the number of leading 1-bits
// in the first byte; the payload bits of the first byte and of each
// continuation byte (10xxxxxx) are then concatenated, most significant first.
//
// Returns U+FFFD (the Unicode replacement character) when:
//   - sUTF8 is null,
//   - the first byte cannot start a sequence (10xxxxxx, 0xFE, 0xFF), or
//   - a continuation byte is not of the form 10xxxxxx (this also stops the
//     decoder at a terminating NUL, so truncated input is never read past).
// Previously these cases fell off the end of the function or read unrelated
// bytes. Overlong encodings and values above U+10FFFF are not rejected.
unsigned int UTF8_To_Unicode(unsigned char *sUTF8){
    const unsigned int kReplacement = 0xFFFDu;
    if (sUTF8 == nullptr) {
        return kReplacement;
    }

    const unsigned int lead = sUTF8[0];

    // Number of leading 1-bits in the first byte == sequence length
    // (0 means a one-byte ASCII character).
    int len = 0;
    for (unsigned int bit = 0x80u; (lead & bit) != 0u; bit >>= 1) {
        ++len;
    }
    if (len == 0) {
        return lead;
    }
    if (len == 1 || len > 6) {
        return kReplacement;
    }

    // The first byte carries (7 - len) payload bits: 0x1F for 2 bytes,
    // 0x0F for 3, ... 0x01 for 6.
    unsigned int codePoint = lead & (0x7Fu >> len);
    for (int i = 1; i < len; ++i) {
        const unsigned int next = sUTF8[i];
        if ((next & 0xC0u) != 0x80u) {
            return kReplacement;
        }
        codePoint = (codePoint << 6) | (next & 0x3Fu);
    }
    return codePoint;
}
我暂且还没想出一种好的算法把这几大坨if用一个简短的表达式表示,欢迎大家讨论。
至于Unicode到GBK的转换,Windows API中有一些函数可以帮忙
// Converts one Unicode code point into a NUL-terminated multi-byte string.
//
// cUni is narrowed to a single wchar_t and formatted with wsprintf's "%wc"
// conversion. NOTE(review): the resulting bytes are presumably in the system
// ANSI code page (GBK on Simplified-Chinese Windows) - confirm against the
// wsprintf documentation for the target build.
//
// Code points above 0xFFFF do not fit in one 16-bit wchar_t; the old cast
// silently truncated them to an unrelated character. They now yield "?".
//
// Returns a buffer of 6 chars allocated with new[]; the caller owns it and
// must release it with delete[].
char* Unicode_To_GBK(unsigned int cUni){
    // Value-initialise so the result is a valid (empty) string even if the
    // conversion below writes nothing.
    char* sGBK = new char[6]();

    if (cUni > 0xFFFFu) {
        sGBK[0] = '?';
        return sGBK;
    }

    wsprintf(sGBK, "%wc", static_cast<wchar_t>(cUni));
    return sGBK;
}
下面是读取汉字源码
#include <Windows.h>
#include<iostream>
#include<cstdio>
//#include<bitset>
#include<cstring>
using namespace std;
//void binoutput(int n){ //二进制输出
// cout<<bitset<10>(n)<<"\n";
//}
// Returns the byte length (1..6) announced by the first byte of the UTF-8
// sequence at sUTF8, or 0 if that byte cannot start a sequence
// (a 10xxxxxx continuation byte, 0xFE or 0xFF).
// Only sUTF8[0] is examined.
int GetUTF8Length(char *sUTF8){
    // One row per lead-byte layout: the bits to inspect, the value they must
    // have, and the sequence length that layout announces.
    struct LeadPattern {
        unsigned char mask;
        unsigned char prefix;
        int length;
    };
    static const LeadPattern kPatterns[] = {
        {0x80, 0x00, 1},  // 0xxxxxxx
        {0xE0, 0xC0, 2},  // 110xxxxx
        {0xF0, 0xE0, 3},  // 1110xxxx
        {0xF8, 0xF0, 4},  // 11110xxx
        {0xFC, 0xF8, 5},  // 111110xx
        {0xFE, 0xFC, 6},  // 1111110x
    };

    // Work on the unsigned value so bytes >= 0x80 are not sign-extended.
    const unsigned char first = static_cast<unsigned char>(sUTF8[0]);
    for (const LeadPattern& pattern : kPatterns) {
        if ((first & pattern.mask) == pattern.prefix) {
            return pattern.length;
        }
    }
    return 0;
}
// Decodes the single UTF-8 sequence that starts at sUTF8 and returns its
// code point.
//
// The sequence length (1..6 bytes) is taken from the number of leading 1-bits
// in the first byte; the payload bits of the first byte and of each
// continuation byte (10xxxxxx) are then concatenated, most significant first.
//
// Returns U+FFFD (the Unicode replacement character) when:
//   - sUTF8 is null,
//   - the first byte cannot start a sequence (10xxxxxx, 0xFE, 0xFF), or
//   - a continuation byte is not of the form 10xxxxxx (this also stops the
//     decoder at a terminating NUL, so truncated input is never read past).
// Previously these cases fell off the end of the function or read unrelated
// bytes. Overlong encodings and values above U+10FFFF are not rejected.
unsigned int UTF8_To_Unicode(unsigned char *sUTF8){
    const unsigned int kReplacement = 0xFFFDu;
    if (sUTF8 == nullptr) {
        return kReplacement;
    }

    const unsigned int lead = sUTF8[0];

    // Number of leading 1-bits in the first byte == sequence length
    // (0 means a one-byte ASCII character).
    int len = 0;
    for (unsigned int bit = 0x80u; (lead & bit) != 0u; bit >>= 1) {
        ++len;
    }
    if (len == 0) {
        return lead;
    }
    if (len == 1 || len > 6) {
        return kReplacement;
    }

    // The first byte carries (7 - len) payload bits: 0x1F for 2 bytes,
    // 0x0F for 3, ... 0x01 for 6.
    unsigned int codePoint = lead & (0x7Fu >> len);
    for (int i = 1; i < len; ++i) {
        const unsigned int next = sUTF8[i];
        if ((next & 0xC0u) != 0x80u) {
            return kReplacement;
        }
        codePoint = (codePoint << 6) | (next & 0x3Fu);
    }
    return codePoint;
}
// Converts one Unicode code point into a NUL-terminated multi-byte string.
//
// cUni is narrowed to a single wchar_t and formatted with wsprintf's "%wc"
// conversion. NOTE(review): the resulting bytes are presumably in the system
// ANSI code page (GBK on Simplified-Chinese Windows) - confirm against the
// wsprintf documentation for the target build.
//
// Code points above 0xFFFF do not fit in one 16-bit wchar_t; the old cast
// silently truncated them to an unrelated character. They now yield "?".
//
// Returns a buffer of 6 chars allocated with new[]; the caller owns it and
// must release it with delete[].
char* Unicode_To_GBK(unsigned int cUni){
    // Value-initialise so the result is a valid (empty) string even if the
    // conversion below writes nothing.
    char* sGBK = new char[6]();

    if (cUni > 0xFFFFu) {
        sGBK[0] = '?';
        return sGBK;
    }

    wsprintf(sGBK, "%wc", static_cast<wchar_t>(cUni));
    return sGBK;
}
// Converts the first UTF-8 encoded character at sUTF8 into the string
// produced by Unicode_To_GBK: the bytes are decoded to a code point with
// UTF8_To_Unicode, and that code point is then re-encoded.
// The returned pointer is whatever Unicode_To_GBK returns (a new[]-allocated
// buffer in this file), so the caller is responsible for releasing it.
char* UTF8_To_GBK(char *sUTF8){
    // The decoder works on unsigned bytes; view the same memory as such.
    unsigned char *rawBytes = reinterpret_cast<unsigned char *>(sUTF8);
    const unsigned int codePoint = UTF8_To_Unicode(rawBytes);
    return Unicode_To_GBK(codePoint);
}
// Reads the UTF-8 text file "hanzi.txt" and prints it character by character
// after converting each character with UTF8_To_GBK.
//
// Returns 0 on success, 1 if the file cannot be opened.
int main(){
    /*
    Reference encodings of the character 啊:
        GBK:           0xB0 0xA1
        Unicode-16 LE: 0x4A 0x55
        Unicode-16 BE: 0x55 0x4A
        UTF-8:         0xE5 0x95 0x8A
    */
    FILE *fp = fopen("hanzi.txt", "r");
    if (fp == nullptr) {
        perror("hanzi.txt");
        return 1;
    }

    char str[100] = {0};
    // Bytes of an incomplete multi-byte character left at the end of the
    // previous chunk; they are kept at the front of the buffer so the next
    // read completes the character instead of splitting it.
    size_t carried = 0;

    // Loop on fgets' result rather than feof(): feof() only becomes true
    // after a read has failed, which made the old loop process the last
    // chunk twice.
    while (fgets(str + carried, static_cast<int>(sizeof(str) - carried), fp) != nullptr) {
        char *pstr = str;
        char *endstr = str + strlen(str);

        while (pstr < endstr) {
            const int len = GetUTF8Length(pstr);
            if (len == 0) {
                // Not a valid lead byte: skip it so the loop always advances.
                ++pstr;
                continue;
            }
            if (pstr + len > endstr) {
                // Character continues in the next chunk.
                break;
            }

            char *gbk = UTF8_To_GBK(pstr);
            printf("%s", gbk);
            delete[] gbk;  // buffer is allocated with new[] by Unicode_To_GBK

            pstr += len;
        }

        carried = static_cast<size_t>(endstr - pstr);
        memmove(str, pstr, carried);
        str[carried] = '\0';
    }

    fclose(fp);
    return 0;
}
提一嘴reinterpret_cast,强制转换真是太爽了,当年被一堆cannot convert from type to const type 要折磨疯了。