解决string.len 处理 utf-8 中文字符不正确的问题。因为这个返回的结果是字节数(不是字符数),比如 UTF-8 里一个中文得到 3,在 GBK 里一个中文得到2。
据说是有个 string.utf8len 还是 string.lenutf8 可以用,但是试了一下发现貌似没有这个函数
幸亏GitHub上有大神贴出了代码,效果非常好
https://github.com/airtonix/ouf_tags/blob/master/utf8.lua
贴出主要部分
local function utf8charbytes (s, i)
-- argument defaults
i = i or 1
-- argument checking
if type(s) ~= "string" then
error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
end
if type(i) ~= "number" then
error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
end
local c = s:byte(i)
-- determine bytes needed for character, based on RFC 3629
-- validate byte 1
if c > 0 and c <= 127 then
-- UTF8-1
return 1
elseif c >= 194 and c <= 223 then
-- UTF8-2
local c2 = s:byte(i + 1)
if not c2 then
error("UTF-8 string terminated early")
end
-- validate byte 2
if c2 < 128 or c2 > 191 then
error("Invalid UTF-8 character")
end
return 2
elseif c >= 224 and c <= 239 then
-- UTF8-3
local c2 = s:byte(i + 1)
local c3 = s:byte(i + 2)
if not c2 or not c3 then
error("UTF-8 string terminated early")
end
-- validate byte 2
if c == 224 and (c2 < 160 or c2 > 191) then
error("Invalid UTF-8 character")
elseif c == 237 and (c2 < 128 or c2 > 159) then
error("Invalid UTF-8 character")
elseif c2 < 128 or c2 > 191 then
error("Invalid UTF-8 character")
end
-- validate byte 3
if c3 < 128 or c3 > 191 then
error("Invalid UTF-8 character")
end
return 3
elseif c >= 240 and c <= 244 then
-- UTF8-4
local c2 = s:byte(i + 1)
local c3 = s:byte(i + 2)
local c4 = s:byte(i + 3)
if not c2 or not c3 or not c4 then
error("UTF-8 string terminated early")
end
-- validate byte 2
if c == 240 and (c2 < 144 or c2 > 191) then
error("Invalid UTF-8 character")
elseif c == 244 and (c2 < 128 or c2 > 143) then
error("Invalid UTF-8 character")
elseif c2 < 128 or c2 > 191 then
error("Invalid UTF-8 character")
end
-- validate byte 3
if c3 < 128 or c3 > 191 then
error("Invalid UTF-8 character")
end
-- validate byte 4
if c4 < 128 or c4 > 191 then
error("Invalid UTF-8 character")
end
return 4
else
error("Invalid UTF-8 character")
end
end
local function utf8len (s)
-- argument checking
if type(s) ~= "string" then
error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
end
local pos = 1
local bytes = s:len()
local len = 0
while pos <= bytes and len ~= chars do
local c = s:byte(pos)
len = len + 1
pos = pos + utf8charbytes(s, pos)
end
if chars ~= nil then
return pos - 1
end
return len
end
print(utf8len("中国人"))