LUA UTF-8 查找字符串是否存在中文

qq_30620793

已于 2022-12-16 17:06:30 修改

阅读量708

点赞数 1

分类专栏： lua 文章标签： lua 开发语言

于 2022-12-14 15:16:30 首次发布

本文链接：https://blog.csdn.net/qq_30620793/article/details/128316729

版权

lua 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

记录字符串UTF-8长度查询，单字符分隔，截取UTF-8字符串
思路：遍历字符串中的每个字符，如果某个字符的 UTF-8 编码落在中文区间范围内，就认为该字符串包含中文。
代码:

--- Width in bytes of the UTF-8 sequence introduced by a lead byte.
---
---   1 byte   0xxxxxxx   0   - 127
---   2 bytes  110xxxxx   192 - 223
---   3 bytes  1110xxxx   224 - 239
---   4 bytes  11110xxx   240 - 247
---
--- @param code number|nil byte value as returned by string.byte
--- @return number 1..4 for a lead byte; 0 for nil, for continuation bytes
---         (128 - 191) and for bytes that cannot start a sequence (248 - 255)
local function utf8LeadWidth(code)
    if code == nil then
        return 0
    elseif code <= 127 then
        return 1
    elseif code <= 191 then
        -- 10xxxxxx is a continuation byte, never the start of a character
        return 0
    elseif code <= 223 then
        return 2
    elseif code <= 239 then
        return 3
    elseif code <= 247 then
        return 4
    end
    return 0
end

--- Number of bytes to advance from byte position `pos` of `str`.
--- Invalid lead bytes are stepped over one byte at a time so that scanning
--- loops always make progress.
local function utf8Step(str, pos)
    local width = utf8LeadWidth(string.byte(str, pos))
    if width == 0 then
        return 1
    end
    return width
end

local obj_StringHelper = {
    --- Byte length of the UTF-8 character that starts `char`.
    --- Only the first byte of `char` is inspected.
    --- @param char string|nil
    --- @return number 1..4, or 0 when `char` is nil, empty, or does not
    ---         start with a valid lead byte
    getByteNumber = function(self, char)
        if not char then
            return 0
        end
        -- string.byte("") yields no value; utf8LeadWidth maps that nil to 0
        return utf8LeadWidth(string.byte(char))
    end,
    --- Count the UTF-8 characters in `str`.
    --- Each invalid byte is counted as one character.
    --- @param str string|nil nil is treated as the empty string
    --- @return number
    lenUTF8 = function(self, str)
        str = str or ""
        local total = #str
        local count = 0
        local pos = 1
        -- Walk by byte position instead of re-slicing the string each step
        while pos <= total do
            pos = pos + utf8Step(str, pos)
            count = count + 1
        end
        return count
    end,
    --- Extract characters startIndex..endIndex (1-based, inclusive) of a
    --- UTF-8 string.
    --- @param str string|nil nil is treated as the empty string
    --- @param startIndex number|nil first character; values below 1 (and nil) mean 1
    --- @param endIndex number|nil last character; nil or any negative value
    ---        means "through the end of the string"
    --- @return string "" when the requested range is empty or starts past the end
    subUTF8 = function(self, str, startIndex, endIndex)
        str = str or ""
        startIndex = math.max(startIndex or 1, 1)
        endIndex = endIndex or -1
        if endIndex >= 0 and endIndex < startIndex then
            return ""
        end

        local total = #str
        local byteStart = total + 1 -- stays past the end if startIndex is never reached
        local byteEnd = -1          -- string.sub end position; -1 = end of string
        local index = 0             -- characters consumed so far
        local pos = 1               -- byte position of the next character

        while pos <= total do
            index = index + 1
            if index == startIndex then
                byteStart = pos
            end
            pos = pos + utf8Step(str, pos)
            if index == endIndex then
                byteEnd = pos - 1
                break
            end
        end
        return string.sub(str, byteStart, byteEnd)
    end,
    --- Split a UTF-8 string into an array of single characters.
    --- Each invalid byte becomes its own one-byte element.
    --- @param str string|nil nil is treated as the empty string
    --- @return table array of strings
    getCharArrByStr = function(self, str)
        str = str or ""
        local result = {}
        local total = #str
        local pos = 1
        while pos <= total do
            local width = utf8Step(str, pos)
            result[#result + 1] = string.sub(str, pos, pos + width - 1)
            pos = pos + width
        end

        return result
    end
}

--- Test whether a sequence of numbers lies inside an inclusive range, using
--- lexicographic order: position 1 is compared first, and a later position
--- only matters while every earlier position tied the bound in question.
---
--- Only the first #tb positions are compared, so a `tb` that is a prefix of a
--- bound counts as tying that bound. If a bound is shorter than `tb` and all
--- of its positions tied, `tb` is treated as greater than that bound.
---
--- @param min table lower bound {a, b, c, ...}
--- @param max table upper bound {a, b, c, ...}
--- @param tb table sequence to test
--- @return boolean true when min <= tb <= max; false for an empty `tb`
local function isScope(min, max, tb)
    local count = #tb
    if count == 0 then
        return false
    end

    -- While a flag is true, every earlier position equalled that bound, so
    -- the current position still has to be checked against it.
    local onLower, onUpper = true, true
    for i = 1, count do
        local v = tb[i]
        if onLower then
            local lo = min[i]
            if lo == nil or v > lo then
                onLower = false -- strictly above min from here on
            elseif v < lo then
                return false
            end
        end
        if onUpper then
            local hi = max[i]
            if hi == nil or v > hi then
                return false
            elseif v < hi then
                onUpper = false -- strictly below max from here on
            end
        end
        if not onLower and not onUpper then
            return true
        end
    end
    return true
end

--- UTF-8 layout
---   1 byte   0xxxxxxx
---   2 bytes  110xxxxx 10xxxxxx
---   3 bytes  1110xxxx 10xxxxxx 10xxxxxx
---   4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
---   (5- and 6-byte forms exist only in the historical definition; in
---   practice a character takes at most 4 bytes)
---
--- Worked examples (code point -> UTF-8 bytes):
---   U+4E00 = 0100 1110 0000 0000 -> 1110[0100] 10[111000] 10[000000] -> {228, 184, 128}
---   U+9FA5 = 1001 1111 1010 0101 -> 1110[1001] 10[111110] 10[100101] -> {233, 190, 165}
---   U+2E80 = 0010 1110 1000 0000 -> 1110[0010] 10[111010] 10[000000] -> {226, 186, 128}
---   U+9FFF = 1001 1111 1111 1111 -> 1110[1001] 10[111111] 10[111111] -> {233, 191, 191}
---
---   [U+4E00 - U+9FA5] Han characters
---   [U+2E80 - U+9FFF] Han characters plus Japanese/Korean CJK (the range used here)
---
--- Approach: every character in the target range is 3 bytes long in UTF-8,
--- so each character's byte values (from string.byte) are compared against
--- the byte sequences of the range's first and last code points.
local obj_findChinese = {
    -- UTF-8 bytes of U+2E80, the inclusive lower bound
    chineseRangeMin = { 226, 186, 128 },
    -- UTF-8 bytes of U+9FFF, the inclusive upper bound
    chineseRangeMax = { 233, 191, 191 },
    --- Report whether `str` contains at least one character in the
    --- configured range.
    --- @param str string|nil
    --- @return boolean false for nil
    hasChinese = function(self, str)
        if str == nil then
            return false
        end
        -- Split into single characters, then check each one in turn
        local charArr = obj_StringHelper:getCharArrByStr(str)
        for _, char in ipairs(charArr) do
            local charLen = obj_StringHelper:getByteNumber(char)
            local charByteCode = { string.byte(char, 1, charLen) }
            if isScope(self.chineseRangeMin, self.chineseRangeMax, charByteCode) then
                return true
            end
        end
        return false
    end
}

-- Demo. The zero-width space U+200B is written as explicit byte escapes
-- (E2 80 8B) because the raw character is invisible in source and easily lost
-- when the file is copied.
print(obj_StringHelper:getByteNumber("\226\128\139")) -- 3
print(obj_findChinese:hasChinese("\226\128\139")) -- false: U+200B is outside the range
print(obj_findChinese:hasChinese("中文")) -- true
print(obj_findChinese:hasChinese("ssssssdfamaolnz,a.spf.zZzhog.。")) -- true: the final "。" is inside the range
print(obj_findChinese:hasChinese("ssssssdfamaolnz,a.spf.zZzhog")) -- false