unicode-utf8转换

最新推荐文章于 2024-04-10 10:43:38 发布

小黑屋1024

最新推荐文章于 2024-04-10 10:43:38 发布

阅读量817

点赞数

分类专栏：字符编码造轮子 c++ 文章标签： c++ 字符编码

本文链接：https://blog.csdn.net/weixin_40953784/article/details/131424338

版权

c++ 同时被 3 个专栏收录

8 篇文章 0 订阅

订阅专栏

字符编码

5 篇文章 0 订阅

订阅专栏

造轮子

2 篇文章 0 订阅

订阅专栏

文章提供了一对C++函数，用于将Unicode值转换为UTF-8编码的字符串，以及将UTF-8字符串转换回Unicode值。代码详细展示了转换过程，并且经过了性能测试。在特定条件下，使用右值引用可提升转换速度。

摘要由CSDN通过智能技术生成

代码

#ifndef UNICONVERT_H
#define UNICONVERT_H

#include <string>

using std::string;

namespace unicodeCvt {
	using uint = unsigned int;

	// UTF-8 byte layouts (x = payload bit), their fixed prefix and payload mask:
	//   10xx xxxx   continuation byte   prefix 0x80   payload mask 0x3F
	//   110x xxxx   2-byte lead         prefix 0xC0   payload mask 0x1F
	//   1110 xxxx   3-byte lead         prefix 0xE0   payload mask 0x0F
	//   1111 0xxx   4-byte lead         prefix 0xF0   payload mask 0x07

	/// Appends the trailing continuation bytes of a UTF-8 sequence to @p str.
	///
	/// @param unic  Value whose low bits are being encoded.
	/// @param num   Index of the highest 6-bit group to emit; num + 1 bytes are
	///              appended, from group @p num down to group 0.
	/// @param str   String that receives the bytes.
	inline void func(uint unic, int num, std::string &str)
	{
		for (int group = num; group >= 0; --group) {
			// Each continuation byte carries exactly six payload bits.
			const uint payload = (unic >> (6 * group)) & 0x3F;
			str.push_back(static_cast<char>(0x80 | payload));
		}
	}

	/// Encodes a single Unicode value as a UTF-8 string.
	///
	/// Every call returns an independent string, so results of successive calls
	/// never alias each other and the function keeps no shared state.
	///
	/// @param unic  Value to encode, 0 through 0x10FFFF inclusive.
	/// @return      The 1- to 4-byte encoding, or an empty string when @p unic
	///              is greater than 0x10FFFF.
	/// @note        Values in the surrogate range 0xD800-0xDFFF are not rejected;
	///              they are encoded as three bytes like any other value below
	///              0x10000.
	inline std::string unicode2Utf8(uint unic)
	{
		std::string utf8;
		if (unic < 0x80) {
			utf8.push_back(static_cast<char>(unic));
		}
		else if (unic < 0x0800) {
			utf8.push_back(static_cast<char>(0xC0 | ((unic >> 6) & 0x1F)));
			func(unic, 0, utf8);
		}
		else if (unic < 0x010000) {
			utf8.push_back(static_cast<char>(0xE0 | ((unic >> 12) & 0x0F)));
			func(unic, 1, utf8);
		}
		else if (unic <= 0x10FFFF) {
			// The upper bound is inclusive: 0x10FFFF is the last code point.
			utf8.push_back(static_cast<char>(0xF0 | ((unic >> 18) & 0x07)));
			func(unic, 2, utf8);
		}
		return utf8;
	}

	/// Decodes the UTF-8 encoding of a single character to its Unicode value.
	///
	/// The sequence length is taken from str.size(); the prefix bits of each
	/// byte are masked off without being validated, so a malformed sequence
	/// produces an unspecified value rather than an error.
	///
	/// @param str  The 1 to 4 bytes of one encoded character.
	/// @return     The decoded value, or 0 when @p str is empty or longer than
	///             four bytes. A one-byte input is returned as its unsigned
	///             byte value.
	inline uint utf82Unicode(const std::string &str)
	{
		const std::string::size_type len = str.size();
		if (len == 0 || len > 4)
			return 0;

		// Widen through unsigned char so bytes >= 0x80 are not sign-extended.
		uint bytes[4] = {0, 0, 0, 0};
		for (std::string::size_type i = 0; i < len; ++i)
			bytes[i] = static_cast<unsigned char>(str[i]);

		// Assemble with shifts only, so the result does not depend on the
		// host byte order.
		switch (len)
		{
		case 1:
			return bytes[0];
		case 2:
			return ((bytes[0] & 0x1F) << 6)
				| (bytes[1] & 0x3F);
		case 3:
			return ((bytes[0] & 0x0F) << 12)
				| ((bytes[1] & 0x3F) << 6)
				| (bytes[2] & 0x3F);
		default: // len == 4
			return ((bytes[0] & 0x07) << 18)
				| ((bytes[1] & 0x3F) << 12)
				| ((bytes[2] & 0x3F) << 6)
				| (bytes[3] & 0x3F);
		}
	}

} // namespace unicodeCvt
#endif // UNICONVERT_H

调用如下：

// Benchmark selector: while UNIC2UTF8 is defined, preformanceTest() times the
// encoder alone; remove the definition to time the encode + decode round trip.
#define UNIC2UTF8

/// Runs the conversion routines over every value from 0 through 0x10FFFF.
///
/// The results are discarded; the function exists only so that its running
/// time can be measured by the caller.
void preformanceTest()
{
#ifdef UNIC2UTF8
	// Encode only. The original author reports that receiving the result
	// through an rvalue reference is many times faster (not re-measured here).
	for (uint i = 0; i <= 0x10FFFF; ++i)
	{
		string &&str = unicode2Utf8(i);
		static_cast<void>(str);
	}
#else
	// Encode, then decode the result again.
	for (uint i = 0; i <= 0x10FFFF; ++i)
	{
		string &&str = unicode2Utf8(i);
		uint unic = utf82Unicode(str);
		static_cast<void>(unic);
	}
#endif // UNIC2UTF8
}

说明

unicode2Utf8 函数将单个 Unicode 码位转换为对应的 UTF-8 编码字符串。
utf82Unicode 函数将单个字符的 UTF-8 编码字符串转换回对应的 Unicode 码位。
两个函数的性能都经过测试验证，目前是我能够优化到的极限。
两者的转换原理依据下表（详情参考字符编码）：

| Unicode 码位范围 | UTF-8 编码（二进制） | 内存空间 |
| --- | --- | --- |
| 0x0000-0x007F | 0xxxxxxx | 一字节 |
| 0x0080-0x07FF | 110xxxxx 10xxxxxx | 两字节 |
| 0x0800-0xFFFF | 1110xxxx 10xxxxxx 10xxxxxx | 三字节 |
| 0x010000-0x10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 四字节 |