记录字符串UTF-8长度查询,单字符分隔,截取UTF-8字符串
思路:遍历字符串每个字符,如果字符在UTF-8中文区间范围就认为有中文。
代码:
--- Width in bytes announced by a UTF-8 lead byte.
-- @param code number|nil byte value as returned by string.byte
-- @return number 1-4 for a lead byte; 0 for nil, for a continuation byte
--   (128-191) and for bytes that cannot start a 1-4 byte sequence (248-255)
local function leadByteWidth(code)
    if not code then
        return 0
    end
    if code <= 127 then
        return 1 -- 0xxxxxxx
    elseif code < 192 then
        return 0 -- 10xxxxxx: continuation byte, never starts a character
    elseif code <= 223 then
        return 2 -- 110xxxxx
    elseif code <= 239 then
        return 3 -- 1110xxxx
    elseif code <= 247 then
        return 4 -- 11110xxx
    end
    return 0
end

--- Number of bytes to consume for the character starting at byte `pos`.
-- Never less than 1, so scanning malformed input always makes progress.
local function charWidthAt(str, pos)
    local width = leadByteWidth(string.byte(str, pos))
    if width < 1 then
        width = 1
    end
    return width
end

local obj_StringHelper = {
    --- UTF-8 encoding rules
    --- single byte  - starts with a 0 bit
    ---   1 byte   0xxxxxxx   0 - 127
    --- multi byte   - first byte starts with n one-bits followed by a zero
    ---   2 bytes  110xxxxx   192 - 223
    ---   3 bytes  1110xxxx   224 - 239
    ---   4 bytes  11110xxx   240 - 247
    --- A character therefore occupies 1-4 bytes.

    --- Byte length of the UTF-8 character that starts `char`.
    -- Only the first byte of `char` is inspected.
    -- @param char string|nil
    -- @return number 1-4, or 0 when `char` is nil, empty, or does not start
    --   with a valid lead byte
    getByteNumber = function(self, char)
        if not char then
            return 0
        end
        -- string.byte returns nothing for "", which leadByteWidth maps to 0.
        return leadByteWidth(string.byte(char))
    end,

    --- Number of UTF-8 characters in `str`.
    -- Bytes that are not a valid lead byte are counted as one character each.
    -- @param str string|nil
    -- @return number character count (0 for nil or "")
    lenUTF8 = function(self, str)
        if not str then
            return 0
        end
        local total = string.len(str)
        local pos = 1 -- byte offset of the next character
        local count = 0
        while pos <= total do
            pos = pos + charWidthAt(str, pos)
            count = count + 1
        end
        return count
    end,

    --- Extract characters startIndex..endIndex (1-based, inclusive) from `str`.
    -- @param str string|nil
    -- @param startIndex number|nil first character; values below 1 (or nil) mean 1
    -- @param endIndex number|nil last character; nil or a negative value means
    --   "through the last character"
    -- @return string the selected characters; "" when the range is empty or
    --   starts past the end of the string
    subUTF8 = function(self, str, startIndex, endIndex)
        if not str then
            return ""
        end
        startIndex = math.max(startIndex or 1, 1)
        endIndex = endIndex or -1
        local toEnd = endIndex < 0
        if not toEnd and endIndex < startIndex then
            return ""
        end

        local total = string.len(str)
        local pos = 1 -- byte offset of the character being visited
        local index = 1 -- 1-based index of that character
        local byteStart = nil
        while pos <= total do
            if index == startIndex then
                byteStart = pos
                if toEnd then
                    break
                end
            end
            local nextPos = pos + charWidthAt(str, pos)
            -- endIndex >= startIndex here, so byteStart is already set.
            if index == endIndex then
                return string.sub(str, byteStart, nextPos - 1)
            end
            pos = nextPos
            index = index + 1
        end

        if not byteStart then
            return "" -- startIndex lies beyond the last character
        end
        return string.sub(str, byteStart)
    end,

    --- Split a UTF-8 string into an array of single characters.
    -- Bytes that are not a valid lead byte become one-byte entries.
    -- @param str string|nil
    -- @return table sequence of strings, one per character (empty for nil or "")
    getCharArrByStr = function(self, str)
        local result = {}
        if not str then
            return result
        end
        local total = string.len(str)
        local pos = 1
        while pos <= total do
            local nextPos = pos + charWidthAt(str, pos)
            result[#result + 1] = string.sub(str, pos, nextPos - 1)
            pos = nextPos
        end
        return result
    end
}
--- Lexicographic comparison of two number sequences.
-- A sequence that is a strict prefix of the other sorts first.
-- @return number -1 when a < b, 0 when equal, 1 when a > b
local function compareSeq(a, b)
    local longest = math.max(#a, #b)
    for i = 1, longest do
        local x, y = a[i], b[i]
        if x == nil then
            return -1
        elseif y == nil then
            return 1
        elseif x ~= y then
            if x < y then
                return -1
            end
            return 1
        end
    end
    return 0
end

--- Check whether `tb` lies between `min` and `max` (both inclusive).
--- The tables are compared element by element: the first elements decide,
--- and only when they are equal is the next pair compared, and so on.
--- Each bound is checked on its own, so a value that shares a prefix with
--- `max` is not also required to stay above the later elements of `min`.
--- @param min table lower bound, e.g. {a, b, c, ...}
--- @param max table upper bound, e.g. {a, b, c, ...}
--- @param tb table sequence to test
--- @return boolean true when min <= tb <= max; false for an empty `tb`
local function isScope(min, max, tb)
    if #tb == 0 then
        return false
    end
    return compareSeq(tb, min) >= 0 and compareSeq(tb, max) <= 0
end
--- UTF-8 layout
---   1 byte   0xxxxxxx
---   2 bytes  110xxxxx 10xxxxxx
---   3 bytes  1110xxxx 10xxxxxx 10xxxxxx
---   4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
--- The forms below are theoretical only; in practice 4 bytes is the maximum.
---   5 bytes  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
---   6 bytes  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
---
--- Worked examples (code point -> UTF-8 bytes):
---   U+4E00 -> 0100 1110 0000 0000 -> 1110[0100] 10[111000] 10[000000] -> {228, 184, 128}
---   U+9FA5 -> 1001 1111 1010 0101 -> 1110[1001] 10[111110] 10[100101] -> {233, 190, 165}
---   U+2E80 -> 0010 1110 1000 0000 -> 1110[0010] 10[111010] 10[000000] -> {226, 186, 128}
---   U+9FFF -> 1001 1111 1111 1111 -> 1110[1001] 10[111111] 10[111111] -> {233, 191, 191}
---
--- Ranges of interest:
---   [U+4E00 - U+9FA5]  Chinese ideographs
---   [U+2E80 - U+9FFF]  Chinese ideographs plus Japanese/Korean CJK text
---
--- Approach: every character in these ranges is encoded in 3 bytes, so the
--- byte values of each character (read with string.byte) are compared against
--- the encoded lower and upper bounds of the range.
local obj_findChinese = {
    -- UTF-8 bytes of U+2E80 (lower bound) and U+9FFF (upper bound).
    chineseRangeMin = { 226, 186, 128 },
    chineseRangeMax = { 233, 191, 191 },

    --- Report whether `str` contains at least one character whose UTF-8
    --- bytes fall inside [chineseRangeMin, chineseRangeMax].
    --- @param str string text to scan
    --- @return boolean true as soon as one matching character is found
    hasChinese = function(self, str)
        -- Split into single characters, then test each one in turn.
        local chars = obj_StringHelper:getCharArrByStr(str)
        for _, ch in ipairs(chars) do
            local byteCodes = { string.byte(ch, 1, -1) }
            if isScope(self.chineseRangeMin, self.chineseRangeMax, byteCodes) then
                return true
            end
        end
        return false
    end
}
-- Demo calls; the expected output is noted after each one.
-- U+200B ZERO WIDTH SPACE is spelled as explicit byte escapes (E2 80 8B).
-- NOTE(review): the original literals appear empty although their comments
-- describe U+200B; presumably the invisible character was lost - confirm.
print(obj_StringHelper:getByteNumber("\226\128\139")) -- 3
print(obj_findChinese:hasChinese("\226\128\139")) -- false: zero width space, U+200B
print(obj_findChinese:hasChinese("中文")) -- true
print(obj_findChinese:hasChinese("ssssssdfamaolnz,a.spf.zZzhog.。")) -- true
print(obj_findChinese:hasChinese("ssssssdfamaolnz,a.spf.zZzhog")) -- false