统计是大数据的基础。这里先学习一些简单、基础的算法：
统计字符出现的次数，可以用来检测重复、估算概率。
下面直接看代码。
--- Extract ASCII alphanumerics and common CJK ideographs from a UTF-8 string.
--
-- Walks `s` one character at a time, using each lead byte to decide how many
-- bytes the character occupies. The characters that are kept are:
--   * ASCII digits and letters (0-9, A-Z, a-z), one byte each;
--   * three-byte sequences between E4 B8 80 and E9 BE A5 inclusive
--     (code points U+4E00 .. U+9FA5).
-- Everything else (punctuation, whitespace, other scripts, stray
-- continuation bytes, incomplete trailing sequences) is skipped.
--
-- @param s string UTF-8 encoded input
-- @return table sequence of single-character strings, in input order
local function filter_spec_chars(s)
    local chars = {}
    local len = #s
    local k = 1
    while k <= len do
        local c = string.byte(s, k)
        if c < 192 then
            -- ASCII byte or stray continuation byte: always one byte wide.
            if (c >= 48 and c <= 57) or (c >= 65 and c <= 90) or (c >= 97 and c <= 122) then
                chars[#chars + 1] = string.char(c)
            end
            k = k + 1
        elseif c < 224 then
            k = k + 2
        elseif c < 240 then
            if c >= 228 and c <= 233 then
                -- Yields nil for any byte past the end of the string.
                local c1, c2 = string.byte(s, k + 1, k + 2)
                if c1 and c2 then
                    -- Default bounds: both trailing bytes anywhere in 0x80..0xBF.
                    local lo1, hi1, hi2 = 128, 191, 191
                    if c == 228 then
                        -- Lower edge of the range: E4 B8 80.
                        lo1 = 184
                    elseif c == 233 then
                        -- Upper edge of the range: E9 BE A5.
                        hi1 = 190
                        if c1 == 190 then
                            hi2 = 165
                        end
                    end
                    if c1 >= lo1 and c1 <= hi1 and c2 >= 128 and c2 <= hi2 then
                        chars[#chars + 1] = string.char(c, c1, c2)
                    end
                end
            end
            k = k + 3
        elseif c < 248 then
            k = k + 4
        elseif c < 252 then
            k = k + 5
        elseif c < 254 then
            k = k + 6
        else
            -- Bytes 0xFE and 0xFF are never valid in UTF-8. Skip one byte so
            -- the scan always makes progress; without this branch the index
            -- never advanced and the loop never terminated.
            k = k + 1
        end
    end
    return chars
end
--- Count the characters in a UTF-8 encoded string.
--
-- Each lead byte determines how many bytes the character occupies
-- (>= 0xFC: 6, >= 0xF8: 5, >= 0xF0: 4, >= 0xE0: 3, >= 0xC0: 2, else 1);
-- that many bytes are consumed and counted as one character. A stray
-- continuation byte counts as a one-byte character.
--
-- A multi-byte sequence that is cut off by the end of the string still
-- counts as one character. The scan is bounded by the string length, so
-- truncated input can no longer push the cursor out of range and raise
-- a nil-comparison error.
--
-- @param input string UTF-8 encoded text
-- @return number count of characters
function string.utf8len(input)
    local len = string.len(input)
    local pos = 1
    local cnt = 0
    while pos <= len do
        local b = string.byte(input, pos)
        local width = 1
        if b >= 0xfc then
            width = 6
        elseif b >= 0xf8 then
            width = 5
        elseif b >= 0xf0 then
            width = 4
        elseif b >= 0xe0 then
            width = 3
        elseif b >= 0xc0 then
            width = 2
        end
        pos = pos + width
        cnt = cnt + 1
    end
    return cnt
end
--- Report whether `hashtable` contains an entry whose key equals `value`.
--
-- Uses a direct raw lookup instead of scanning every entry with pairs, so
-- a membership test costs one table access rather than a full traversal.
-- rawget is used so that, like the traversal it replaces, an __index
-- metamethod is not consulted.
--
-- A variant that searches for `value` among the table's values (rather than
-- its keys) would still have to walk the table with pairs.
--
-- @param hashtable table table to search
-- @param value any key to look for
-- @return the key when an entry with that key exists, otherwise nil
function table.keyof(hashtable, value)
    if value ~= nil and rawget(hashtable, value) ~= nil then
        return value
    end
    return nil
end
--- Count how often each character occurs across the strings in `secondName`
-- and print every character seen at least `num` (200) times.
--
-- `secondName` is a collection of words/phrases that is not defined in this
-- block; it must exist before this function runs. If only whole words or
-- phrases needed counting, the per-character splitting would be unnecessary.
--
-- Also prints the CPU time spent counting. os.clock is used because
-- os.time only has one-second resolution.
local function HashFold()
    local started = os.clock()
    -- NOTE(review): the table is seeded with one two-character entry; it can
    -- never be matched by a single-character key -- confirm it is intentional.
    local hash = {["李强"] = 1}
    for _, phrase in pairs(secondName) do
        local list = filter_spec_chars(phrase)
        for i = 1, #list do
            local ch = list[i]
            -- Direct indexed update: one table access per character.
            hash[ch] = (hash[ch] or 0) + 1
        end
    end
    print("GameToos.getTimeiv() HashFold :", os.clock() - started)
    local num = 200
    for k, v in pairs(hash) do
        if v >= num then
            -- Message fixed: it used to append a stray "100" after the
            -- threshold and said "greater than" although the test is >=.
            print("出现次数大于等于" .. num .. "次的字符", k, v)
        end
    end
end
-- Run the character-frequency count once when this script is executed.
HashFold ()
如果有什么问题，请多多指教。