-- Helpers for splitting a UTF-8 encoded string into characters and for
-- measuring its length, followed by a small demo.

-- Pattern fragment that matches a NUL byte inside a character class.
-- Lua 5.1 patterns cannot contain an embedded "\0" and must use %z, whereas
-- %z is deprecated since Lua 5.2 (where "\0" is allowed in patterns). Probe
-- once so the same pattern keeps working whichever form is supported.
local NUL_CLASS = string.find("\0", "%z") and "%z" or "\0"

-- One UTF-8 encoded character:
--   1. The first byte is 0x00-0x7F (0-127) or 0xC2-0xF4 (194-244). UTF-8 is
--      ASCII compatible, so 0-127 are exactly the ASCII characters.
--   2. 0xC0, 0xC1 and 0xF5-0xFF (192, 193, 245-255) never occur in UTF-8.
--   3. 0x80-0xBF (128-191) only occur as the second and later bytes of a
--      multi-byte character (e.g. Chinese characters).
local UTF8_CHAR_PATTERN = "[" .. NUL_CLASS .. "\1-\127\194-\244][\128-\191]*"

-- Split a string into a table holding one UTF-8 character per entry.
-- The input is not validated: a lead byte takes every continuation byte that
-- follows it, and bytes that cannot start a character (192, 193, 245-255, or
-- a continuation byte with no lead byte before it) are skipped silently.
-- @param s  string to split
-- @return   array of substrings, in input order (empty table for "")
function stringToTable(s)
  local tb = {}
  for utfChar in string.gmatch(s, UTF8_CHAR_PATTERN) do
    tb[#tb + 1] = utfChar
  end
  return tb
end

-- Length of a string where a single-byte (ASCII) character counts as 1 unit
-- and every multi-byte character counts as 2 units. Any character encoded
-- with more than one byte is treated as "Chinese", i.e. 2 units wide.
-- @param s  string to measure
-- @return   total number of units
function getUTFLen(s)
  local len = 0
  for _, utfChar in ipairs(stringToTable(s)) do
    len = len + (#utfChar > 1 and 2 or 1)
  end
  return len
end

-- Length of a string where every character counts as 1 unit, regardless of
-- how many bytes it is encoded with.
-- @param s  string to measure
-- @return   number of characters found by stringToTable
function getNewUTFLen(s)
  return #stringToTable(s)
end

-- Demo: count Chinese and ASCII characters alike.
local str = "一二三@#[]【】789&*():"
print(getNewUTFLen(str)) -- 17

-- Demo: dump each character with its byte length and its first byte value.
local s = "①贰aA#}。"
local sTab = stringToTable(s)
for i = 1, #sTab do
  local outStr = string.format(
    "sTab index:%d,str:\"%s\",Len:%s,byte:%d",
    i,
    sTab[i],
    string.len(sTab[i]),
    string.byte(sTab[i])
  )
  print(outStr)
end
print("#sTab = " .. #sTab)
print("getUTFLen = " .. getUTFLen(s))
--[[
Output of the per-character demo for s = "①贰aA#}。":
sTab index:1,str:"①",Len:3,byte:226
sTab index:2,str:"贰",Len:3,byte:232
sTab index:3,str:"a",Len:1,byte:97
sTab index:4,str:"A",Len:1,byte:65
sTab index:5,str:"#",Len:1,byte:35
sTab index:6,str:"}",Len:1,byte:125
sTab index:7,str:"。",Len:3,byte:227
#sTab = 7
getUTFLen = 10
]]