实现原理
UTF-8 字符的首字节有四种形式,分别对应四种编码长度:1字节:0xxxxxxx;2字节:110xxxxx 10xxxxxx;3字节:1110xxxx 10xxxxxx 10xxxxxx;4字节:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx。读取首字节的高位比特即可判断该字符占用的字节数,其后以 10 开头的字节均为延续字节。
参考代码
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>
#ifndef LOGOUT
#define LOGOUT printf
#endif
/*
@brief Split a UTF-8 encoded string into its individual characters.

Every character is copied into its own std::string and appended to des;
elements already present in des are kept. The byte length of a character
is derived from its lead byte:
  1 byte : 0xxxxxxx
  2 bytes: 110xxxxx 10xxxxxx
  3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
  4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

Parsing stops with a LOGOUT message at the first byte that is not a valid
lead byte (a stray continuation byte, 0xF8..0xFF, or an embedded NUL) or
at a sequence that would run past the end of sou. Characters split before
that point remain in des. Continuation bytes are not validated beyond the
length check. An empty sou appends nothing and logs nothing.

@param des receives the split characters
@param sou source string
*/
void splitString(std::vector<std::string>& des, const std::string& sou)
{
    const std::string::size_type total = sou.size();
    std::string::size_type pos = 0;

    while (pos < total)
    {
        // Read through unsigned char so bytes >= 0x80 are never sign-extended.
        const unsigned char lead = static_cast<unsigned char>(sou[pos]);

        // Length announced by the lead byte; 0 means "not a lead byte".
        std::string::size_type len = 0;
        if (0x00 != lead && 0x00 == (0x80 & lead))
        {
            len = 1;
        }
        else if (0xC0 == (0xE0 & lead))
        {
            // Mask the top three bits: 0xC0..0xDF are all 2-byte leads.
            len = 2;
        }
        else if (0xE0 == (0xF0 & lead))
        {
            len = 3;
        }
        else if (0xF0 == (0xF8 & lead))
        {
            // Mask the top five bits so 0xF8..0xFF are rejected.
            len = 4;
        }

        // Reject invalid lead bytes and sequences truncated by the end of the string.
        if (0 == len || len > total - pos)
        {
            LOGOUT("--%d-- --%s-- pos:%u utf-8字符串判断出错!!!\n", __LINE__, __FILE__, static_cast<unsigned int>(pos));
            return;
        }

        des.push_back(sou.substr(pos, len));
        pos += len;
    }
}
/*
@brief Classify the first byte of sou as a UTF-8 lead byte.

Only sou[0] is inspected; the remaining bytes are neither required nor
validated. Lead byte patterns:
  1 byte : 0xxxxxxx
  2 bytes: 110xxxxx
  3 bytes: 1110xxxx
  4 bytes: 11110xxx

@param sou source character (only its first byte is examined)
@return 1~4: byte length announced by the lead byte;
        0: sou is empty, or its first byte is NUL, a continuation byte
        (10xxxxxx) or an invalid byte (0xF8..0xFF)
*/
const int judgeWhetherUtf8Character(const std::string& sou)
{
    if (sou.empty())
    {
        return 0;
    }

    // Read through unsigned char so bytes >= 0x80 are never sign-extended.
    const unsigned char lead = static_cast<unsigned char>(sou[0]);

    // NUL is deliberately reported as "not a character".
    if (0x00 != lead && 0x00 == (0x80 & lead))
    {
        return 1;
    }
    // Mask the top three bits: 0xC0..0xDF are all 2-byte leads.
    if (0xC0 == (0xE0 & lead))
    {
        return 2;
    }
    if (0xE0 == (0xF0 & lead))
    {
        return 3;
    }
    // Mask the top five bits so 0xF8..0xFF are rejected.
    if (0xF0 == (0xF8 & lead))
    {
        return 4;
    }
    return 0;
}
// Demo: splits an ASCII sample string and prints each character followed by a space.
int main(void)
{
    std::vector<std::string> des;
    splitString(des, "hello0123456789");
    // Bind by const reference; plain `auto` would copy every std::string.
    for (const auto& ch : des)
    {
        std::cout << ch << " ";
    }
    return 0;
}