/*
 * UTF-8 byte layout for each code-point range (original 1-6 byte scheme).
 *
 *  n | Unicode range (hex)   | UTF-8 encoding (binary)
 * ---+-----------------------+------------------------------------------------------
 *  1 | 0000 0000 - 0000 007F | 0xxxxxxx
 *  2 | 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 *  3 | 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 *  4 | 0001 0000 - 001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *  5 | 0020 0000 - 03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *  6 | 0400 0000 - 7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
/**
 * Encode one code point as a UTF-8 byte sequence (1 to 6 bytes).
 *
 * @param _in  Code point to encode; values up to 0x7FFFFFFF are accepted.
 * @param _out Destination buffer. It must have room for the number of bytes
 *             returned (at most 6). No terminator is appended.
 * @return Number of bytes stored in _out (1..6), or 0 when _in is larger
 *         than 0x7FFFFFFF, in which case _out is not written.
 */
int ucs2utf(unsigned long _in, unsigned char *_out)
{
    /* Marker bits of the lead byte, indexed by total sequence length. */
    static const unsigned char lead_marker[7] = {
        0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
    };
    unsigned long bits = _in;
    int total;
    int idx;

    /* Single-byte values carry 7 payload bits and need no continuation. */
    if (_in <= 0x7FUL) {
        _out[0] = (unsigned char)(_in & 0x7F);
        return 1;
    }

    if (_in <= 0x7FFUL) {
        total = 2;
    } else if (_in <= 0xFFFFUL) {
        total = 3;
    } else if (_in <= 0x1FFFFFUL) {
        total = 4;
    } else if (_in <= 0x3FFFFFFUL) {
        total = 5;
    } else if (_in <= 0x7FFFFFFFUL) {
        total = 6;
    } else {
        return 0;
    }

    /* Continuation bytes are filled last-to-first, six payload bits each. */
    for (idx = total - 1; idx > 0; --idx) {
        _out[idx] = (unsigned char)((bits & 0x3F) | 0x80);
        bits >>= 6;
    }

    /* Whatever remains belongs in the lead byte under its length marker. */
    _out[0] = (unsigned char)((bits & 0x3F) | lead_marker[total]);
    return total;
}
/**
 * Report how many bytes a UTF-8 sequence occupies, judged by its first byte.
 *
 * @param _in First byte of the candidate sequence.
 * @return Sequence length 1..6, or 0 when _in cannot start a sequence:
 *         continuation bytes (0x80-0xBF) and the bytes 0xFE and 0xFF.
 *
 * Only the length marker is inspected; 0xC0 and 0xC1 are still reported as
 * 2-byte lead bytes.
 */
int get_utf8_size(unsigned char _in)
{
    if (_in < 0x80) {   /* 0xxxxxxx */
        return 1;
    }
    if (_in < 0xC0) {   /* 10xxxxxx: continuation byte, not a lead byte */
        return 0;
    }
    if (_in < 0xE0) {   /* 110xxxxx */
        return 2;
    }
    if (_in < 0xF0) {   /* 1110xxxx */
        return 3;
    }
    if (_in < 0xF8) {   /* 11110xxx */
        return 4;
    }
    if (_in < 0xFC) {   /* 111110xx */
        return 5;
    }
    if (_in < 0xFE) {   /* 1111110x */
        return 6;
    }
    /* 0xFE and 0xFF match no lead-byte pattern. */
    return 0;
}
/**
 * Decode the UTF-8 sequence that starts at _in into a single code point.
 *
 * @param _in  Pointer to the first byte of the sequence.
 * @param _out Receives the decoded code point. The whole value is assigned,
 *             so the result does not depend on the previous contents of
 *             *_out or on the byte order of the machine.
 * @return Number of bytes consumed (1..6), or 0 when either pointer is null,
 *         the first byte is not a lead byte, or a continuation byte does not
 *         have the form 10xxxxxx. *_out is left unchanged when 0 is returned.
 *
 * Overlong encodings and surrogate values are not rejected.
 */
int utf2ucs(const unsigned char *_in, unsigned long *_out)
{
    /* Payload bits held by the lead byte, indexed by sequence length. */
    static const unsigned char lead_mask[7] = {
        0x00, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01
    };
    unsigned long value;
    int utfbytes;
    int i;

    if (!_in || !_out) {
        return 0;
    }

    utfbytes = get_utf8_size(*_in);
    if (utfbytes < 1 || utfbytes > 6) {
        return 0;
    }

    value = (unsigned long)(_in[0] & lead_mask[utfbytes]);
    for (i = 1; i < utfbytes; ++i) {
        /*
         * Validate each byte before touching the next one: a string cut off
         * mid-sequence ends in a NUL, which fails this test, so nothing past
         * the terminator is read.
         */
        if ((_in[i] & 0xC0) != 0x80) {
            return 0;
        }
        value = (value << 6) | (unsigned long)(_in[i] & 0x3F);
    }

    *_out = value;
    return utfbytes;
}
/**
 * Check that the string begins with one well-formed UTF-8 sequence.
 *
 * Only the first encoded character is examined: its lead byte must yield a
 * non-zero length from get_utf8_size(), and each of the following
 * (length - 1) bytes must have the form 10xxxxxx. Nothing after that first
 * character is inspected.
 *
 * @param _in NUL-terminated string, or NULL.
 * @return 1 when the first sequence is well-formed (an empty string counts,
 *         because its first byte 0x00 is a 1-byte sequence); 0 when _in is
 *         NULL or the first sequence is malformed.
 */
int isutf8format(const char *_in)
{
    const char *cursor;
    int pending;

    if (NULL == _in) {
        return 0;
    }

    pending = get_utf8_size((unsigned char)*_in);
    if (pending == 0) {
        return 0;
    }

    /* The lead byte is accounted for; the rest must be continuation bytes. */
    cursor = _in + 1;
    for (--pending; pending > 0; --pending, ++cursor) {
        /* A terminating NUL also fails here, so the scan stays in bounds. */
        if (((unsigned char)*cursor & 0xC0) != 0x80) {
            return 0;
        }
    }
    return 1;
}
/*
 * Step size, in bytes, for every possible byte value when walking a UTF-8
 * string. Lead bytes map to the length of their sequence; bytes that cannot
 * start a sequence (0x80-0xBF, 0xFE, 0xFF) map to 1 so a scan always advances.
 */
unsigned char utf8_look_for_table[256] =
{
    /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0x8F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90-0x9F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0-0xAF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0-0xBF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Look up the step size for byte value x; x must be in the range 0..255. */
#define UTFLEN(x) (utf8_look_for_table[(x)])
/**
 * Count the characters in a NUL-terminated UTF-8 string.
 *
 * Each step advances by the UTFLEN() value of the byte under the cursor and
 * counts as one character. The input is not validated.
 *
 * @param _in NUL-terminated string; the function does not modify it.
 * @return Number of characters counted, or 0 when _in is null or empty.
 */
int get_utf8_len(char *_in)
{
    const char *ptr = _in;
    size_t remaining;
    int len = 0;

    if (!_in) {
        return 0;
    }

    remaining = strlen(_in);
    while (remaining > 0) {
        size_t step = UTFLEN((unsigned char)*ptr);

        /*
         * A lead byte near the end may announce more bytes than the string
         * still holds. Clamp the step so the cursor never moves past the
         * terminator; the truncated tail still counts as one character.
         */
        if (step > remaining) {
            step = remaining;
        }
        ptr += step;
        remaining -= step;
        ++len;
    }
    return len;
}
/*
 * Conversion between Unicode and UTF-8.
 * (Source page footer: latest recommended article published 2023-06-27 20:01:02.)
 */