计算utf8字符串长度，截取utf8字符串的部分字符串，cpp源码

最新推荐文章于 2023-05-15 09:05:27 发布

御风@户外

最新推荐文章于 2023-05-15 09:05:27 发布

阅读量514

点赞数

分类专栏： cpp 文章标签： utf

本文链接：https://blog.csdn.net/weixin_43172531/article/details/103787960

版权

cpp 专栏收录该内容

114 篇文章 1 订阅

订阅专栏

utf8编码是Unicode编码的一种。
因为utf8以单字节为编码单元，所以不存在大小端（字节序）问题，不存在歧义，
utf8支持的字符总数也够大（按1～6字节编码计算：$128 + 32\times64 + 16\times64^{2} + 8\times64^{3} + 4\times64^{4} + 2\times64^{5} = 2216757376$）、所有语言都可以采用一种编码方式…
似乎现在所有系统都倾向于默认采用utf8编码形式。
代码文件也建议采用带BOM的utf8格式存储。

当中英…各种语言的字符混合时，求字符长度（非字节长度）、截取多语言字符串成为常见需求，以下为代码

// 2017-06-06: UTF-8 string helpers (character counting, substring extraction).
//
// A UTF-8 character occupies 1 to 6 bytes here, and its first (lead) byte
// alone tells how many bytes the whole character uses:
//   0xxxxxxx
//   110xxxxx 10xxxxxx
//   1110xxxx 10xxxxxx 10xxxxxx
//   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//
// Lookup table with 256 entries: entry b is the byte length of the UTF-8
// character whose lead byte is b. Bytes that do not match any lead pattern
// above (10xxxxxx continuation bytes 0x80-0xBF, plus 0xFE and 0xFF) map to 1,
// so a scanner using this table steps over them one byte at a time.
// The 5- and 6-byte rows follow the original UTF-8 definition; RFC 3629
// limits well-formed UTF-8 to 4 bytes.
static const unsigned char gUtf8LengthTable[256] =
{
    /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0x8F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90-0x9F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0-0xAF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0-0xBF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

// Byte length of the UTF-8 character whose lead byte is x.
// The argument is converted to unsigned char inside the macro so that passing
// a plain or signed char with the high bit set cannot produce a negative
// index into gUtf8LengthTable; callers that already cast are unaffected.
#define UTFLEN(x)  (gUtf8LengthTable[static_cast<unsigned char>(x)])

// Counts the characters (not bytes) in a NUL-terminated UTF-8 string.
//
// str     NUL-terminated UTF-8 text; NULL is treated as an empty string.
// returns the number of characters found before the terminating NUL.
//
// The scan never moves past the terminating NUL: if the last lead byte
// announces more bytes than the string actually contains (truncated input),
// the step is clamped to the bytes that remain and that partial sequence is
// counted as one character.
inline int getUtf8Length(const char *str)
{
    if (str == NULL)
    {
        return 0;
    }

    const char *const strEnd = str + strlen(str);
    int count = 0;

    for (const char *ptr = str; ptr < strEnd; ++count)
    {
        const std::size_t step = UTFLEN(static_cast<unsigned char>(*ptr));
        const std::size_t left = static_cast<std::size_t>(strEnd - ptr);
        // Clamp so a truncated trailing sequence cannot carry ptr beyond strEnd.
        ptr += (step < left) ? step : left;
    }

    return count;
}

// std::string convenience overload: forwards the string's C-string view to
// the const char* overload, so counting stops at the first NUL byte.
inline int getUtf8Length(const std::string &str)
{
    const char *const bytes = str.c_str();
    return getUtf8Length(bytes);
}

// Extracts the characters [start, end) of a NUL-terminated UTF-8 string,
// with indices counted in characters rather than bytes.
//
// str             NUL-terminated UTF-8 text; NULL yields "".
// start           index of the first character to take; a negative value is
//                 treated as 0.
// end             index of the first character NOT taken; values beyond the
//                 end of the string are clamped to it. end <= start yields "".
// max_byte_count  size in bytes of a destination buffer, including room for
//                 the terminating NUL (for callers that store the result in a
//                 fixed-length array). The result holds at most
//                 max_byte_count - 1 bytes and a character is never split: a
//                 character that does not fit entirely is dropped together
//                 with everything after it. A value <= 0 means "no limit".
// returns         the selected substring as a new std::string.
//
// The string is walked once and the walk is bounded by the terminating NUL.
// If the final lead byte announces more bytes than the string contains, the
// step is clamped to the bytes that remain, so nothing past the terminator is
// ever read or copied.
inline std::string subUtf8String(const char *str, const int start, int end, int max_byte_count = 0)
{
    if (str == NULL)
    {
        return "";
    }

    const std::size_t byteLen = strlen(str);
    const char *const strEnd = str + byteLen;

    // Largest number of payload bytes the result may hold (terminator excluded).
    const std::size_t byteBudget = (max_byte_count <= 0)
        ? byteLen
        : static_cast<std::size_t>(max_byte_count) - 1;

    // A negative start would otherwise make the copy loop run extra iterations.
    const int first = (start < 0) ? 0 : start;

    // Skip the leading characters to reach the start position.
    const char *sptr = str;
    for (int i = 0; i < first && sptr < strEnd; ++i)
    {
        const std::size_t step = UTFLEN(static_cast<unsigned char>(*sptr));
        const std::size_t left = static_cast<std::size_t>(strEnd - sptr);
        sptr += (step < left) ? step : left;
    }

    // Extend the end position one whole character at a time while the
    // character limit, the end of the string and the byte budget allow it.
    const char *eptr = sptr;
    for (int i = first; i < end && eptr < strEnd; ++i)
    {
        std::size_t step = UTFLEN(static_cast<unsigned char>(*eptr));
        const std::size_t left = static_cast<std::size_t>(strEnd - eptr);
        if (step > left)
        {
            step = left;
        }
        if (static_cast<std::size_t>(eptr - sptr) + step > byteBudget)
        {
            break; // this character would overflow the caller's buffer
        }
        eptr += step;
    }

    return std::string(sptr, static_cast<std::size_t>(eptr - sptr));
}

// std::string convenience overload: forwards the string's C-string view and
// the remaining arguments unchanged to the const char* overload.
inline std::string subUtf8String(const std::string &str, const int start, int end, int max_byte_count = 0)
{
    const char *const bytes = str.c_str();
    return subUtf8String(bytes, start, end, max_byte_count);
}