unicode与utf-8相互转化

/* 
  | Code point range      | UTF-8 encoding
n | (hexadecimal)         | (binary)
---+-----------------------+------------------------------------------------------  
1 | 0000 0000 - 0000 007F |                                              0xxxxxxx  
2 | 0000 0080 - 0000 07FF |                                     110xxxxx 10xxxxxx  
3 | 0000 0800 - 0000 FFFF |                            1110xxxx 10xxxxxx 10xxxxxx  
4 | 0001 0000 - 001F FFFF |                   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  
5 | 0020 0000 - 03FF FFFF |          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  
6 | 0400 0000 - 7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  
*/


/*
 * Encode a single code point as a UTF-8 byte sequence, using the legacy
 * scheme that allows sequences of 1 to 6 bytes.
 *
 * _in   code point to encode
 * _out  destination buffer; receives exactly as many bytes as the return
 *       value says (no terminator is appended)
 *
 * Returns the number of bytes written (1..6), or 0 when _in is larger
 * than 0x7FFFFFFF, in which case nothing is written.
 */
int ucs2utf(unsigned long _in, unsigned char *_out)
{
	unsigned char lead_marker;	/* fixed high bits of the first byte */
	int total;			/* length of the encoded sequence */
	int idx;

	/* Pick the sequence length and lead-byte marker from the value range. */
	if (_in <= 0x0000007FUL) {
		total = 1;
		lead_marker = 0x00;
	} else if (_in <= 0x000007FFUL) {
		total = 2;
		lead_marker = 0xC0;
	} else if (_in <= 0x0000FFFFUL) {
		total = 3;
		lead_marker = 0xE0;
	} else if (_in <= 0x001FFFFFUL) {
		total = 4;
		lead_marker = 0xF0;
	} else if (_in <= 0x03FFFFFFUL) {
		total = 5;
		lead_marker = 0xF8;
	} else if (_in <= 0x7FFFFFFFUL) {
		total = 6;
		lead_marker = 0xFC;
	} else {
		return 0;
	}

	/* Emit the continuation bytes last-to-first, six payload bits each. */
	for (idx = total - 1; idx > 0; --idx) {
		_out[idx] = (unsigned char)((_in & 0x3F) | 0x80);
		_in >>= 6;
	}

	/* The bits that remain fit in the lead byte's payload for this range. */
	_out[0] = (unsigned char)(_in | lead_marker);
	return total;
}

/*
 * Return the total length, in bytes, of the UTF-8 sequence introduced by
 * the lead byte _in (legacy scheme, up to 6 bytes).
 *
 * Returns 1..6 for a byte that can start a sequence, or 0 when it cannot:
 *   - 0x80-0xBF are continuation bytes,
 *   - 0xFE and 0xFF never appear in UTF-8.
 *
 * The previous implementation let 0xFE and 0xFF fall through to the
 * ">= 0xF8" branch and reported them as 5-byte lead bytes.
 */
int get_utf8_size(unsigned char _in)
{
	if (_in < 0x80) {	/* 0xxxxxxx */
		return 1;
	}
	if (_in < 0xC0) {	/* 10xxxxxx: continuation byte, not a lead byte */
		return 0;
	}
	if (_in < 0xE0) {	/* 110xxxxx */
		return 2;
	}
	if (_in < 0xF0) {	/* 1110xxxx */
		return 3;
	}
	if (_in < 0xF8) {	/* 11110xxx */
		return 4;
	}
	if (_in < 0xFC) {	/* 111110xx */
		return 5;
	}
	if (_in < 0xFE) {	/* 1111110x */
		return 6;
	}
	return 0;		/* 0xFE, 0xFF */
}

/*
 * Decode the UTF-8 sequence starting at _in into a single code point.
 *
 * _in   pointer to the first byte of the sequence; bytes after it are read
 *       one at a time and decoding stops at the first byte that is not a
 *       continuation byte, so a NUL-terminated string is never read past
 *       its terminator
 * _out  receives the decoded code point; left untouched on failure
 *
 * Returns the number of bytes consumed (1..6), or 0 when either pointer is
 * null, the first byte cannot start a sequence, or a continuation byte is
 * missing or malformed.
 *
 * The value is assembled arithmetically and stored with one assignment, so
 * the result does not depend on the host byte order and every byte of
 * *_out is defined. This replaces byte-wise stores that computed the wrong
 * bits for 5- and 6-byte sequences and shifted possibly-signed char values.
 */
int utf2ucs(const unsigned char *_in, unsigned long *_out)
{
	/* Payload bits carried by the lead byte, indexed by sequence length. */
	static const unsigned char lead_payload_mask[7] = {
		0x00, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01
	};
	unsigned long value;
	int utfbytes;
	int i;

	if (!_in || !_out) {
		return 0;
	}

	/* 0xFE and 0xFF are never valid, whatever the length helper reports. */
	if (*_in >= 0xFE) {
		return 0;
	}

	utfbytes = get_utf8_size(*_in);
	if (utfbytes < 1 || utfbytes > 6) {
		return 0;
	}

	value = (unsigned long)(*_in & lead_payload_mask[utfbytes]);

	/* Each continuation byte (10xxxxxx) contributes six more bits. */
	for (i = 1; i < utfbytes; ++i) {
		unsigned char c = _in[i];

		if ((c & 0xC0) != 0x80) {
			return 0;
		}
		value = (value << 6) | (unsigned long)(c & 0x3F);
	}

	*_out = value;
	return utfbytes;
}

/*
 * Check whether a NUL-terminated string is structurally valid UTF-8
 * (legacy scheme, sequences of up to 6 bytes).
 *
 * _in  string to check
 *
 * Returns 1 when every lead byte is followed by the number of continuation
 * bytes (10xxxxxx) it announces, 0 otherwise or when _in is NULL. An empty
 * string is reported as valid.
 *
 * The previous implementation examined only the first encoded character
 * and ignored the rest of the string. Only the byte structure is checked;
 * overlong forms and surrogate code points are not rejected.
 */
int isutf8format(const char *_in)
{
	const unsigned char *p = (const unsigned char *)_in;

	if (NULL == _in)
	{
		return 0;
	}

	while (*p != 0)
	{
		int remaining;

		/* 0xFE and 0xFF never appear in UTF-8. */
		if (*p >= 0xFE) {
			return 0;
		}

		remaining = get_utf8_size(*p);
		if (remaining == 0) {
			return 0;
		}
		++p;

		/*
		 * A terminator inside a sequence fails the continuation test, so
		 * the scan never moves past the end of the string.
		 */
		for (--remaining; remaining > 0; --remaining) {
			if ((*p & 0xC0) != 0x80) {
				return 0;
			}
			++p;
		}
	}
	return 1;
}


/*
 * Length in bytes of a UTF-8 sequence, indexed by the value of its first
 * byte. Bytes that cannot start a sequence (0x80-0xBF, 0xFE, 0xFF) map
 * to 1.
 */
unsigned char utf8_look_for_table[256] =
{
	/* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80-0x8F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90-0x9F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0-0xAF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0-0xBF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Sequence length for first byte x; x must be in the range 0..255. */
#define UTFLEN(x) (utf8_look_for_table[(x)])

/*
 * Count the characters (encoded sequences) in a NUL-terminated UTF-8 string.
 *
 * _in  string to measure
 *
 * Returns the number of sequences found by stepping from lead byte to lead
 * byte with UTFLEN, or 0 when _in is NULL. The input is not validated.
 *
 * The scan is bounded by the byte length of the string. A final lead byte
 * that announces more bytes than remain is still counted once, but the
 * position is never dereferenced beyond the terminator, which the previous
 * pointer-walking loop could do.
 */
int get_utf8_len(char *_in)
{
	size_t clen;
	size_t pos = 0;
	int len = 0;

	if (!_in) {
		return 0;
	}

	clen = strlen(_in);
	while (pos < clen) {
		pos += UTFLEN((unsigned char)_in[pos]);
		len++;
	}
	return len;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值