local socket = require "socket"
--- Determine how many bytes a UTF-8 sequence occupies from its lead byte.
-- Follows the original 1- to 6-byte scheme, so lead bytes up to 0xFD are accepted.
-- @param ch numeric value of the lead byte (may be nil)
-- @return the sequence length (1..6), or -1 when ch is nil/false, falls in
--         the continuation-byte range 0x80-0xBF, or is 0xFE or above
local function utf8len(ch)
    if not ch then
        return -1
    end
    -- Each guard handles one lead-byte range; ranges are tested in ascending
    -- order, so only the exclusive upper bound needs checking.
    if ch < 0x80 then return 1 end   -- single-byte (ASCII) range
    if ch < 0xC0 then return -1 end  -- continuation byte, cannot start a sequence
    if ch < 0xE0 then return 2 end
    if ch < 0xF0 then return 3 end
    if ch < 0xF8 then return 4 end
    if ch < 0xFC then return 5 end
    if ch < 0xFE then return 6 end
    return -1                        -- 0xFE and above are never lead bytes
end
--- Split a byte string into its UTF-8 characters.
-- Only the lead byte of each sequence is inspected (via utf8len); the
-- continuation bytes are copied without being validated.
-- @param input string to split (nil/false yields nil, nil)
-- @return an array of character substrings and a parallel array holding the
--         byte length of each character; nil, nil when utf8len rejects a lead
--         byte or a sequence would extend past the end of the input
local function getutf8tbl(input)
    if not input then
        return nil, nil
    end
    local chars, widths = {}, {}
    local total = #input
    local n = 0      -- number of characters collected so far
    local pos = 1    -- byte offset of the next lead byte
    while pos <= total do
        local width = utf8len(string.byte(input, pos))
        local last = pos + width - 1
        if width <= 0 or last > total then
            return nil, nil
        end
        n = n + 1
        chars[n] = string.sub(input, pos, last)
        widths[n] = width
        pos = last + 1
    end
    return chars, widths
end
-- Load the forbidden-word list and print how long loading took.
--   data[n][word]  : set of forbidden words whose byte length is n
--   maxlen         : byte length of the longest forbidden word
--   firstword[ch]  : set of the first UTF-8 characters of all forbidden words
local f0 = socket.gettime()
local data = {}
local maxlen = 0
local firstword = {}
for entry in io.lines("forbidden_words.txt") do
    local nbytes = #entry

    -- Bucket the word by its byte length, creating the bucket on first use.
    local bucket = data[nbytes]
    if not bucket then
        bucket = {}
        data[nbytes] = bucket
    end
    bucket[entry] = true

    if maxlen < nbytes then
        maxlen = nbytes
    end

    -- Remember the leading character; utf8len yields -1 for an empty line or
    -- a rejected lead byte, in which case nothing is recorded.
    local leadlen = utf8len(string.byte(entry, 1))
    if leadlen > 0 then
        firstword[entry:sub(1, leadlen)] = true
    end
end
local f1 = socket.gettime()
print(f1 - f0)
-- Mask forbidden words in every line of test.txt and write the result to
-- out.txt. For each line, also print the number of dictionary lookups
-- performed and the elapsed time.
-- assert turns a failed open into a clear error instead of a nil-index crash.
local fout = assert(io.open("out.txt", "w"))
for str in io.lines("test.txt") do
    local t0 = socket.gettime()
    local tbl, tbllen = getutf8tbl(str)
    if not tbl then
        -- The line could not be split into UTF-8 characters. Report it and
        -- skip it; continuing would raise an error on #tbl below. Nothing is
        -- written to out.txt for such a line.
        print(str .. " input is invalid")
    else
        local count = 0 -- number of candidate substrings looked up in data
        local len = #tbl
        for i = 1, len do
            -- Only start matching at characters that begin some forbidden
            -- word; '*' entries (literal or already masked) are skipped.
            if tbl[i] ~= '*' and firstword[tbl[i]] then
                local wordlen = 0 -- byte length of the candidate tbl[i..j]
                for j = i, len do
                    wordlen = wordlen + tbllen[j]
                    if wordlen > maxlen then
                        -- No forbidden word is this long; stop extending.
                        break
                    end
                    local t = data[wordlen]
                    if t then
                        count = count + 1
                        if t[table.concat(tbl, "", i, j)] then
                            -- Mask the shortest match, one '*' per character.
                            for k = i, j do
                                tbl[k] = '*'
                            end
                            break
                        end
                    end
                end
            end
        end
        local t1 = socket.gettime()
        -- NOTE(review): the elapsed time is written directly after the
        -- filtered text with no separator -- confirm this is intended.
        fout:write(table.concat(tbl), t1 - t0, '\n')
        print(count, t1 - t0)
    end
end
fout:close()
lua之屏蔽字替换为 '*'
最新推荐文章于 2024-06-20 16:26:00 发布
该博客介绍了如何使用 Lua 处理 UTF-8 编码的字符串:先读取 'forbidden_words.txt' 文件构建禁词表,再对 'test.txt' 中的文本逐行处理。匹配到禁词时,将其中的每个字符替换为 '*',并统计每行进行的词表查询次数。文章还输出了词表加载以及每行处理的耗时。
摘要由CSDN通过智能技术生成