代码
#ifndef UNICONVERT_H
#define UNICONVERT_H
#include <string>
using std::string;
namespace unicodeCvt {
typedef unsigned int uint;

// UTF-8 byte layout (x = payload bit):
//   continuation byte  10xx xxxx   marker 0x80, payload mask 0x3F
//   2-byte lead byte   110x xxxx   marker 0xC0, payload mask 0x1F
//   3-byte lead byte   1110 xxxx   marker 0xE0, payload mask 0x0F
//   4-byte lead byte   1111 0xxx   marker 0xF0, payload mask 0x07

/// @brief Appends UTF-8 continuation bytes for the low 6-bit groups of a value.
///
/// Groups are emitted from group index @p num down to group 0, so `num + 1`
/// bytes are appended; nothing is appended when @p num is negative.
///
/// @param unic Value whose 6-bit groups are emitted.
/// @param num  Index of the highest 6-bit group to emit.
/// @param str  String the continuation bytes are appended to.
inline void func(uint unic, int num, std::string &str) {
    for (int group = num; group >= 0; --group) {
        // Keep exactly six payload bits, then add the 10xxxxxx marker.
        const uint payload = (unic >> (6 * group)) & 0x3Fu;
        str.append(1, static_cast<char>(0x80u | payload));
    }
}

/// @brief Encodes a single Unicode code point as a UTF-8 byte string.
///
/// The result is returned by value, so every call yields an independent
/// string and the function keeps no state between calls.
///
/// @param unic Code point in the range 0x0 .. 0x10FFFF (inclusive).
/// @return The 1- to 4-byte encoding, or an empty string when @p unic is
///         greater than 0x10FFFF. Values in 0xD800 .. 0xDFFF are encoded
///         with the ordinary 3-byte pattern and are not rejected.
inline std::string unicode2Utf8(uint unic) {
    std::string utf8;
    if (unic < 0x80u) {
        utf8.append(1, static_cast<char>(unic));
    } else if (unic < 0x0800u) {
        utf8.append(1, static_cast<char>(0xC0u | ((unic >> 6) & 0x1Fu)));
        func(unic, 0, utf8);
    } else if (unic < 0x010000u) {
        utf8.append(1, static_cast<char>(0xE0u | ((unic >> 12) & 0x0Fu)));
        func(unic, 1, utf8);
    } else if (unic <= 0x10FFFFu) {
        // Inclusive bound: U+10FFFF is the last valid code point.
        utf8.append(1, static_cast<char>(0xF0u | ((unic >> 18) & 0x07u)));
        func(unic, 2, utf8);
    }
    return utf8;
}

/// @brief Decodes a string holding exactly one UTF-8 encoded character.
///
/// The byte count of @p str is taken as the length of the sequence. The lead
/// byte must carry the marker for that length and every following byte must
/// be a continuation byte (10xxxxxx). Bytes are combined arithmetically, so
/// the result does not depend on host byte order.
///
/// @param str The UTF-8 bytes of a single character (1 to 4 bytes).
/// @return The decoded code point, or 0 when @p str is empty, longer than
///         four bytes, or structurally malformed. Note that U+0000 also
///         decodes to 0. Overlong forms and the 0xD800 .. 0xDFFF range are
///         not rejected.
inline uint utf82Unicode(const std::string &str) {
    const std::string::size_type len = str.size();
    if (len == 0 || len > 4)
        return 0;

    const unsigned char lead = static_cast<unsigned char>(str[0]);
    uint unicode = 0;
    switch (len) {
    case 1:
        // A lone byte must be plain ASCII (0xxxxxxx).
        return lead < 0x80u ? static_cast<uint>(lead) : 0u;
    case 2:
        if ((lead & 0xE0u) != 0xC0u)
            return 0;
        unicode = lead & 0x1Fu;
        break;
    case 3:
        if ((lead & 0xF0u) != 0xE0u)
            return 0;
        unicode = lead & 0x0Fu;
        break;
    default: // len == 4
        if ((lead & 0xF8u) != 0xF0u)
            return 0;
        unicode = lead & 0x07u;
        break;
    }

    // Each continuation byte contributes six more low-order bits.
    for (std::string::size_type i = 1; i < len; ++i) {
        const unsigned char next = static_cast<unsigned char>(str[i]);
        if ((next & 0xC0u) != 0x80u)
            return 0;
        unicode = (unicode << 6) | (next & 0x3Fu);
    }
    return unicode;
}
} // namespace unicodeCvt
#endif // UNICONVERT_H
调用如下:
#define UNIC2UTF8
void preformanceTest()
{
#ifdef ABC
//右值能提升很多倍速度
for (uint i = 0; i < 0x10FFFF; i++)
{
string &&str = unicode2Utf8(i);
}
#else
for (uint i = 0; i < 0x10FFFF; i++)
{
string &&str = unicode2Utf8(i);
uint unic = utf82Unicode(str);
}
#endif // UNIC2UTF8
}
说明
- unicode2Utf8 函数将单个 Unicode 码位转换为对应的 UTF-8 编码字符串。
- utf82Unicode 函数将单个字符的 UTF-8 编码字符串转换为 Unicode 码位。
- 两个函数的性能都经过测试验证,目前是我能够优化的极限。
- 两者的转换原理依据下表(详情参考字符编码):
Unicode码位范围 | utf-8编码二进制 | 内存空间 |
---|---|---|
0x00-0x7F | 0xxxxxxx | 一字节 |
0x80-0x07FF | 110xxxxx 10xxxxxx | 两字节 |
0x0800-0xFFFF | 1110xxxx 10xxxxxx 10xxxxxx | 三字节 |
0x010000-0x10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 四字节 |