4.2)汉字编码判断
4.2.1)判断是否是ASCII
/*------------------------------------------------------------------------
 Procedure: IsAscii
 Purpose:   Tell whether a single 8-bit byte is a non-NUL ASCII character.
 Input:     character - the byte to classify.
 Output:    1 if the byte is in the range 0x01..0x7F.
            0 otherwise. Note that NUL (0x00) is deliberately reported as
            "not ASCII", so a string terminator is never classified as a
            character.
------------------------------------------------------------------------*/
int IsAscii(unsigned char character)
{
    return (character > 0 && character < 128) ? 1 : 0;
}
4.2.2)gbk编码中判断汉字编码形式
/*------------------------------------------------------------------------
 Procedure: IsChineseStandard
 Purpose:   Classify the character that starts at CharP, where a Chinese
            character occupies two bytes (GB2312 layout).
 Input:     CharP - pointer to the first byte of the character. The second
            byte, CharP[1], is read only when the first byte lies in
            0xA1..0xF7.
 Output:    0 - GB2312 hanzi          (lead 0xB0..0xF7, trail 0xA1..0xFE)
            1 - GB2312 symbol / punctuation
                                      (lead 0xA1..0xAF, trail 0xA1..0xFE)
            2 - the first byte is ASCII according to IsAscii()
            3 - anything else (including a NUL first byte or a lead byte
                followed by an invalid trail byte)
------------------------------------------------------------------------*/
int IsChineseStandard(unsigned char * CharP)
{
    const unsigned char lead = CharP[0];

    if (lead >= 0xA1 && lead <= 0xF7) {
        const unsigned char trail = CharP[1];

        /* Both areas share the same trail-byte range; 0xFF is never a
           valid trail byte. */
        if (trail >= 0xA1 && trail <= 0xFE) {
            return (lead >= 0xB0) ? 0 : 1;
        }
        /* A byte >= 0xA1 can never be ASCII, so this is "other". */
        return 3;
    }

    return IsAscii(lead) ? 2 : 3;
}
4.2.3)unicode判断是否是汉字
/*
 * Returns true when both ch[0] and ch[1] are >= 0x80, false otherwise.
 * ch[1] is only read when ch[0] is >= 0x80.
 *
 * NOTE(review): the surrounding text presents this as a Unicode check, but
 * the test only inspects two consecutive bytes, which looks like a
 * double-byte (GB-style) check - confirm the intended encoding.
 */
bool isChineseChar(unsigned char* ch)
{
    return ch[0] >= 0x80 && ch[1] >= 0x80;
}
4.2.4)utf-8判断是否是汉字
汉字(CJK统一表意文字基本区)的Unicode编码区域是:
4E00-9FFF(位于0800-FFFF区间内,因此在UTF-8中占3个字节,首字节为0xE4-0xE9;0080-07FF是2字节区间,不包含汉字)
4.2.5)各种编码之间的转换
注:utf8 与 gbk 之间的转换需要通过 unicode 进行中转。
1)utf8 与 unicode 的转换
/*--------------------------------------------------------------------------
 UTF-8 encodes UCS code points in 8-bit units. The mapping from UCS-2 is:

   UCS-2 (hex)      UTF-8 byte stream (binary)
   0000 - 007F      0xxxxxxx
   0080 - 07FF      110xxxxx 10xxxxxx
   0800 - FFFF      1110xxxx 10xxxxxx 10xxxxxx

 Example: U+6C49 lies in 0800-FFFF, so it uses the 3-byte template.
 6C49 in binary is 0110 110001 001001; filling the x positions in order
 gives 11100110 10110001 10001001, i.e. E6 B1 89.
--------------------------------------------------------------------------*/

/* Number of UTF-8 bytes needed to encode one UCS-2 code unit. */
static int ucs2Utf8Len(unsigned short w)
{
    if (w <= 0x7f) {
        return 1;
    }
    if (w <= 0x7ff) {
        return 2;
    }
    return 3;
}

/*
 * Convert a 0-terminated UCS-2 string to UTF-8.
 *
 * ucs      - 0-terminated UCS-2 input; NULL is treated as an empty string.
 * cbuf     - output buffer, or NULL to only query the required size.
 * cbuf_len - capacity of cbuf in bytes.
 *
 * Returns the number of bytes required for the whole input when cbuf is
 * NULL or cbuf_len <= 0. Otherwise returns the number of bytes actually
 * written. Only whole characters are written: conversion stops before the
 * first character that would not fit. No terminating NUL is written.
 */
int ucs2ToUtf8( const unsigned short * ucs, unsigned char * cbuf, int cbuf_len)
{
    int total = 0;  /* bytes needed for the complete input          */
    int count = 0;  /* leading code units that fit into cbuf_len    */
    int i;
    int j;

    if (!ucs) {
        return 0;
    }

    /* Pass 1: size the output and find how many code units fit. */
    for (i = 0; ucs[i] != 0; i++) {
        total += ucs2Utf8Len(ucs[i]);
        if (total <= cbuf_len) {
            count = i + 1;
        }
    }

    if (!cbuf || cbuf_len <= 0) {
        return total;
    }

    /* Pass 2: encode exactly the code units that fit. */
    j = 0;
    for (i = 0; i < count; i++) {
        const unsigned short w = ucs[i];

        if (w <= 0x7f) {
            cbuf[j++] = (unsigned char)w;
        }
        else if (w <= 0x7ff) {
            cbuf[j++] = (unsigned char)(0xc0 | ((w >> 6) & 0x1f));
            cbuf[j++] = (unsigned char)(0x80 | (w & 0x3f));
        }
        else {
            cbuf[j++] = (unsigned char)(0xe0 | ((w >> 12) & 0x0f));
            cbuf[j++] = (unsigned char)(0x80 | ((w >> 6) & 0x3f));
            cbuf[j++] = (unsigned char)(0x80 | (w & 0x3f));
        }
    }
    return j;
}
/*
 * Length in bytes (1..3) of the well-formed UTF-8 sequence starting at p.
 * Returns 0 at the terminating NUL, on a malformed sequence, or on a lead
 * byte that does not map to UCS-2 (4-byte sequences, stray continuation
 * bytes). Continuation bytes are checked left to right, so nothing past a
 * terminating NUL is ever read.
 */
static int utf8SeqLen(const unsigned char *p)
{
    const unsigned char c = p[0];

    if (c == 0) {
        return 0;
    }
    if ((c & 0x80) == 0) {
        return 1;
    }
    if ((c & 0xe0) == 0xc0) {
        return ((p[1] & 0xc0) == 0x80) ? 2 : 0;
    }
    if ((c & 0xf0) == 0xe0) {
        return ((p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80) ? 3 : 0;
    }
    return 0;
}

/*
 * Convert a NUL-terminated UTF-8 string to UCS-2.
 *
 * s        - NUL-terminated UTF-8 input; NULL is treated as an empty string.
 * wbuf     - output buffer, or NULL to only query the required size.
 * wbuf_len - capacity of wbuf in 16-bit code units.
 *
 * The input is first scanned to count the characters that can be decoded;
 * a malformed or unsupported sequence is treated as the end of the input.
 * Returns that count when wbuf is NULL or wbuf_len <= 0. Otherwise decodes
 * at most wbuf_len characters into wbuf and returns how many were stored.
 * No terminating 0 is written.
 */
int utf8ToUcs2( const unsigned char * s, unsigned short * wbuf, int wbuf_len)
{
    const unsigned char *p;
    int count = 0;
    int len;
    int k;

    if (!s) {
        return 0;
    }

    /* Pass 1: count decodable characters. */
    p = s;
    len = utf8SeqLen(p);
    while (len != 0) {
        count++;
        p += len;
        len = utf8SeqLen(p);
    }

    if (!wbuf || wbuf_len <= 0) {
        return count;
    }
    if (count > wbuf_len) {
        count = wbuf_len;
    }

    /* Pass 2: decode; every sequence visited here was validated above. */
    p = s;
    for (k = 0; k < count; k++) {
        len = utf8SeqLen(p);
        if (len == 1) {
            wbuf[k] = p[0];
        }
        else if (len == 2) {
            wbuf[k] = (unsigned short)(((p[0] & 0x1f) << 6) | (p[1] & 0x3f));
        }
        else {
            /* Mask the lead byte explicitly instead of relying on
               truncation to 16 bits to drop the 1110 prefix. */
            wbuf[k] = (unsigned short)(((p[0] & 0x0f) << 12)
                                       | ((p[1] & 0x3f) << 6)
                                       | (p[2] & 0x3f));
        }
        p += len;
    }
    return count;
}
2)gbk 与 unicode 的转换
/*
 * Count the characters in a NUL-terminated double-byte (GB-style) string,
 * i.e. the number of code units its Unicode form would need.
 *
 * A byte with the high bit set starts a two-byte character; any other
 * byte is a single-byte character. A dangling lead byte immediately
 * followed by the terminating NUL is counted as one character and ends
 * the scan, so the terminator is never skipped.
 */
static int getUniLenOfGbStr( const unsigned char *p )
{
    int len = 0;

    while ( *p ) {
        if ( (*p & 0x80) && p[1] != 0 ) {
            p += 2;
        }
        else {
            p += 1;
        }
        len++;
    }
    return len;
}
/*
 * Compute how many bytes a 0-terminated 16-bit string occupies once
 * converted to a double-byte (GB-style) encoding: one byte for every code
 * unit below 0x80, two bytes for every other code unit (unsupported
 * characters are expected to be replaced by "??", which is also two
 * bytes). The terminator is not counted.
 */
static int getGbLenOfUniStr( const unsigned short *p )
{
    int bytes = 0;
    const unsigned short *cur;

    for ( cur = p; *cur != 0; ++cur ) {
        bytes += ( *cur < 0x80 ) ? 1 : 2;
    }
    return bytes;
}