lua中从字符串中提取汉字
1、utf8_to_unicode:将utf8字符转换为unicode格式的数值,用于和汉字的unicode编码比较大小
2、GetVChineseChar:遍历字符串中的每个字符,获取其unicode值,再与汉字的unicode编码范围比较,从而从字符串中提取汉字
注:如果是lua5.1,需要使用BitOp库(require "bit"),5.2则require "bit32",5.3及以上版本已经原生支持位运算符
如果是5.1或5.2,位运算符&,|,<<,>>等需要替换为对应的band,bor,lshift,rshift等函数
--- Extract every Chinese character (hanzi) from a UTF-8 encoded string.
-- The string is walked one encoded character at a time; each complete
-- sequence is decoded with M.utf8_to_unicode and kept when its code point
-- lies in the inclusive range U+4E00 .. U+9FBB.
-- @param str UTF-8 encoded string to scan
-- @return a string made of the hanzi found, in their original order
--         (the empty string when there are none or when str is empty)
function M.GetVChineseChar(str)
    -- Inclusive code point range treated as hanzi (19968 .. 40891).
    -- NOTE(review): newer Unicode versions assign ideographs past U+9FBB
    -- in this block; the upper bound is kept as-is to preserve behavior.
    local HANZI_FIRST, HANZI_LAST = 0x4E00, 0x9FBB

    local resultChar = {}
    local len = #str
    local i = 1
    -- Testing the bound up front makes the empty string return "" instead
    -- of comparing a nil byte against a number.
    while i <= len do
        local curByte = string.byte(str, i)

        -- Sequence length is decided by the lead byte. Bytes 0x80..0xBF are
        -- continuation bytes; when one shows up in lead position it is
        -- skipped alone so the scan re-synchronises on the next character.
        local byteCount
        if curByte >= 0xF0 then
            byteCount = 4 -- 4-byte character
        elseif curByte >= 0xE0 then
            byteCount = 3 -- 3-byte character (where hanzi live)
        elseif curByte >= 0xC0 then
            byteCount = 2 -- 2-byte character
        else
            byteCount = 1 -- ASCII or stray continuation byte
        end

        local subStr = string.sub(str, i, i + byteCount - 1)
        -- A sequence cut off by the end of the string cannot be a valid
        -- character, so it is never handed to the decoder.
        if #subStr == byteCount then
            local charUnicodeNum = M.utf8_to_unicode(subStr)
            if charUnicodeNum >= HANZI_FIRST and charUnicodeNum <= HANZI_LAST then
                resultChar[#resultChar + 1] = subStr
            end
        end

        i = i + byteCount
    end
    return table.concat(resultChar)
end
--- Decode the first UTF-8 encoded character of a string into its Unicode
-- code point (as a decimal integer).
-- Handles 1- to 4-byte sequences. Continuation bytes are only masked to
-- their low six bits; they are not otherwise validated.
-- @param convertStr string whose first character is decoded
-- @return the code point; 0 when the string is empty, the lead byte is not
--         a recognised UTF-8 lead byte, or the sequence is cut short.
--         A non-string argument is returned unchanged.
function M.utf8_to_unicode(convertStr)
    if type(convertStr) ~= "string" then
        return convertStr
    end

    local lead = string.byte(convertStr, 1)
    if lead == nil then
        return 0 -- empty string
    end
    if lead <= 0x7f then
        return lead -- single-byte (ASCII) character
    end

    -- The high bits of the lead byte give the number of continuation bytes;
    -- the remaining low bits are the top bits of the code point.
    local extra, code
    if lead & 0xe0 == 0xc0 then     -- 110xxxxx
        extra, code = 1, lead & 0x1f
    elseif lead & 0xf0 == 0xe0 then -- 1110xxxx
        extra, code = 2, lead & 0x0f
    elseif lead & 0xf8 == 0xf0 then -- 11110xxx
        extra, code = 3, lead & 0x07
    else
        return 0 -- continuation byte or invalid lead byte
    end

    -- Each continuation byte (10xxxxxx) contributes six more bits.
    for i = 2, extra + 1 do
        local cont = string.byte(convertStr, i)
        if cont == nil then
            return 0 -- truncated sequence: report "no character"
        end
        code = (code << 6) | (cont & 0x3f)
    end
    return code
end