--Author mu 15/10/9
-- TreeNode: one node of the sensitive-word trie.
--   data   : Dic mapping a child's key to the child TreeNode
--   value  : the key this node was added under (set by addChild)
--   parent : the node this one was added to (set by addChild)
-- NOTE(review): TreeNode is assigned again later in this file; that later
-- definition replaces this one when the file is loaded.
TreeNode = class("TreeNode")

-- Constructor: creates the empty child dictionary.
-- Presumably invoked by the project's class() helper from TreeNode.new()
-- -- TODO confirm against the class() implementation.
function TreeNode:ctor()
    self.data = Dic.new()
end

-- Look up a direct child.
-- @param name key of the child (a one-character string when called from Sensitive)
-- @return the child TreeNode, or nil when there is no child under that key
function TreeNode:getChild(name)
    return self.data:GetName(name)
end

-- Create a new child under `char`, link it back to this node and return it.
-- An existing child stored under the same key is overwritten.
-- @param char key (and value) of the new child
-- @return the newly created TreeNode
function TreeNode:addChild(char)
    local child = TreeNode.new()
    self.data:SetName(char, child)
    child.value = char
    child.parent = self
    return child
end

-- Rebuild the string spelled by the path from the topmost ancestor down to
-- this node by prepending each ancestor's `value`.
-- Every node on the path must have a string `value`; a missing one raises a
-- concatenation error.
-- @return the concatenated values
function TreeNode:getFullWord()
    local word = self.value
    local ancestor = self.parent
    while ancestor do
        word = ancestor.value .. word
        ancestor = ancestor.parent
    end
    return word
end

-- Tell whether this node has no children.
-- The result is also cached in self._isLeaf, as before.
-- @return true when the child dictionary is empty
function TreeNode:isLeaf()
    -- next() returns nil for an empty table, so there is no need to count
    -- every child just to test for emptiness.
    self._isLeaf = (next(self.data.dic) == nil)
    return self._isLeaf
end
-- Dic: thin wrapper around a plain Lua table, stored in the `dic` field.
-- TreeNode uses it to map a key to a child node.
-- NOTE(review): Dic is assigned again later in this file; that later
-- definition replaces this one when the file is loaded.
Dic = class("Dic")

-- Original note (translated): "whether this is the last character of a
-- sensitive word; a leaf of the sensitive-word tree is always a word-final
-- character, a parent node is not necessarily one."
-- NOTE(review): this appears to describe the `isEnd` flag set on tree nodes
-- by Sensitive:regSensitiveWords rather than anything in Dic -- confirm.

-- Constructor: start with an empty backing table.
function Dic:ctor()
    local storage = {}
    self.dic = storage
end

-- Fetch the entry stored under `name`.
-- @param name lookup key
-- @return the stored value, or nil when the key is absent
function Dic:GetName(name)
    local entries = self.dic
    return entries[name]
end

-- Store `src` under `name`, replacing any previous entry.
-- @param name key to store under
-- @param src value to store
function Dic:SetName(name, src)
    local entries = self.dic
    entries[name] = src
end
-- Sensitive: sensitive-word filter backed by a trie of TreeNode objects.
-- Words are indexed and matched one byte at a time (string.sub(s, i, i)), so
-- a multi-byte character is masked with one "*" per byte.
-- NOTE(review): Sensitive is assigned again later in this file; that later
-- definition replaces this one when the file is loaded.
Sensitive = {}

-- Return the index of the last byte of the longest registered word that
-- starts exactly at byte `start` of `text`, or nil when no word starts there.
local function longestMatchEnd(root, text, start)
    local node = root
    local matchEnd = nil
    for i = start, string.len(text) do
        node = node:getChild(string.sub(text, i, i))
        if not node then
            break
        end
        if node.isEnd then
            -- Remember this end but keep walking: a longer word may follow.
            matchEnd = i
        end
    end
    return matchEnd
end

-- Build the sensitive-word index tree and store it in self.treeRoot.
-- This is a preprocessing step: it costs more than the lookup methods, but it
-- is meant to be called only once, when the program starts.
-- @param words table whose values are the sensitive words; keys are ignored
function Sensitive:regSensitiveWords(words)
    self.treeRoot = TreeNode.new()
    -- The root contributes nothing to TreeNode:getFullWord().
    self.treeRoot.value = ""
    for _, word in pairs(words) do
        local branch = self.treeRoot
        for i = 1, string.len(word) do
            local char = string.sub(word, i, i)
            branch = branch:getChild(char) or branch:addChild(char)
        end
        -- Mark the node reached by the last character as a word end.
        branch.isEnd = true
    end
end

-- Build the mask that replaces a matched word.
-- @param len number of "*" characters wanted
-- @return a string of `len` asterisks ("" when len <= 0)
function Sensitive:getReplaceWord(len)
    -- math.floor keeps the old numeric-for behaviour for fractional lengths
    -- on Lua versions where string.rep rejects non-integer counts.
    return string.rep("*", math.floor(len))
end

-- Replace every sensitive word in the string and return the result.
-- Scans left to right; at each position the longest registered word starting
-- there is masked with one "*" per byte, and scanning resumes right after it.
-- The text is spliced by index instead of going through string.gsub, so
-- pattern magic characters in words are harmless and only the matched
-- occurrence is touched.
-- @param dirtyWords text to filter
-- @return the filtered text (the input itself when nothing matched)
function Sensitive:replaceSensitiveWord(dirtyWords)
    local root = self.treeRoot
    local total = string.len(dirtyWords)
    local pieces = {}
    local copyFrom = 1 -- first byte not yet copied into `pieces`
    local pos = 1
    while pos <= total do
        local matchEnd = longestMatchEnd(root, dirtyWords, pos)
        if matchEnd then
            pieces[#pieces + 1] = string.sub(dirtyWords, copyFrom, pos - 1)
            pieces[#pieces + 1] = self:getReplaceWord(matchEnd - pos + 1)
            pos = matchEnd + 1
            copyFrom = pos
        else
            pos = pos + 1
        end
    end
    if copyFrom == 1 then
        -- copyFrom only advances on a match, so nothing was replaced.
        return dirtyWords
    end
    pieces[#pieces + 1] = string.sub(dirtyWords, copyFrom)
    return table.concat(pieces)
end

-- Tell whether the string contains at least one registered sensitive word.
-- @param dirtyWords text to inspect
-- @return true when any registered word occurs in the text, false otherwise
function Sensitive:containsBadWords(dirtyWords)
    local root = self.treeRoot
    for pos = 1, string.len(dirtyWords) do
        if longestMatchEnd(root, dirtyWords, pos) then
            return true
        end
    end
    return false
end
-- TreeNode: one node of the sensitive-word trie.
--   data   : Dic mapping a child's key to the child TreeNode
--   value  : the key this node was added under (set by addChild)
--   parent : the node this one was added to (set by addChild)
-- NOTE(review): TreeNode is also assigned earlier in this file; this later
-- definition replaces the earlier one when the file is loaded.
TreeNode = class("TreeNode")

-- Constructor: creates the empty child dictionary.
-- Presumably invoked by the project's class() helper from TreeNode.new()
-- -- TODO confirm against the class() implementation.
function TreeNode:ctor()
    self.data = Dic.new()
end

-- Look up a direct child.
-- @param name key of the child (a one-character string when called from Sensitive)
-- @return the child TreeNode, or nil when there is no child under that key
function TreeNode:getChild(name)
    return self.data:GetName(name)
end

-- Create a new child under `char`, link it back to this node and return it.
-- An existing child stored under the same key is overwritten.
-- @param char key (and value) of the new child
-- @return the newly created TreeNode
function TreeNode:addChild(char)
    local child = TreeNode.new()
    self.data:SetName(char, child)
    child.value = char
    child.parent = self
    return child
end

-- Rebuild the string spelled by the path from the topmost ancestor down to
-- this node by prepending each ancestor's `value`.
-- Every node on the path must have a string `value`; a missing one raises a
-- concatenation error.
-- @return the concatenated values
function TreeNode:getFullWord()
    local word = self.value
    local ancestor = self.parent
    while ancestor do
        word = ancestor.value .. word
        ancestor = ancestor.parent
    end
    return word
end

-- Tell whether this node has no children.
-- The result is also cached in self._isLeaf, as before.
-- @return true when the child dictionary is empty
function TreeNode:isLeaf()
    -- next() returns nil for an empty table, so there is no need to count
    -- every child just to test for emptiness.
    self._isLeaf = (next(self.data.dic) == nil)
    return self._isLeaf
end
-- Dic: thin wrapper around a plain Lua table, stored in the `dic` field.
-- TreeNode uses it to map a key to a child node.
-- NOTE(review): Dic is also assigned earlier in this file; this later
-- definition replaces the earlier one when the file is loaded.
Dic = class("Dic")

-- Original note (translated): "whether this is the last character of a
-- sensitive word; a leaf of the sensitive-word tree is always a word-final
-- character, a parent node is not necessarily one."
-- NOTE(review): this appears to describe the `isEnd` flag set on tree nodes
-- by Sensitive:regSensitiveWords rather than anything in Dic -- confirm.

-- Constructor: start with an empty backing table.
function Dic:ctor()
    local storage = {}
    self.dic = storage
end

-- Fetch the entry stored under `name`.
-- @param name lookup key
-- @return the stored value, or nil when the key is absent
function Dic:GetName(name)
    local entries = self.dic
    return entries[name]
end

-- Store `src` under `name`, replacing any previous entry.
-- @param name key to store under
-- @param src value to store
function Dic:SetName(name, src)
    local entries = self.dic
    entries[name] = src
end
-- Sensitive: sensitive-word filter backed by a trie of TreeNode objects.
-- Words are indexed and matched one byte at a time (string.sub(s, i, i)), so
-- a multi-byte character is masked with one "*" per byte.
-- NOTE(review): Sensitive is also assigned earlier in this file; this later
-- definition replaces the earlier one when the file is loaded.
Sensitive = {}

-- Return the index of the last byte of the longest registered word that
-- starts exactly at byte `start` of `text`, or nil when no word starts there.
local function longestMatchEnd(root, text, start)
    local node = root
    local matchEnd = nil
    for i = start, string.len(text) do
        node = node:getChild(string.sub(text, i, i))
        if not node then
            break
        end
        if node.isEnd then
            -- Remember this end but keep walking: a longer word may follow.
            matchEnd = i
        end
    end
    return matchEnd
end

-- Build the sensitive-word index tree and store it in self.treeRoot.
-- This is a preprocessing step: it costs more than the lookup methods, but it
-- is meant to be called only once, when the program starts.
-- @param words table whose values are the sensitive words; keys are ignored
function Sensitive:regSensitiveWords(words)
    self.treeRoot = TreeNode.new()
    -- The root contributes nothing to TreeNode:getFullWord().
    self.treeRoot.value = ""
    for _, word in pairs(words) do
        local branch = self.treeRoot
        for i = 1, string.len(word) do
            local char = string.sub(word, i, i)
            branch = branch:getChild(char) or branch:addChild(char)
        end
        -- Mark the node reached by the last character as a word end.
        branch.isEnd = true
    end
end

-- Build the mask that replaces a matched word.
-- @param len number of "*" characters wanted
-- @return a string of `len` asterisks ("" when len <= 0)
function Sensitive:getReplaceWord(len)
    -- math.floor keeps the old numeric-for behaviour for fractional lengths
    -- on Lua versions where string.rep rejects non-integer counts.
    return string.rep("*", math.floor(len))
end

-- Replace every sensitive word in the string and return the result.
-- Scans left to right; at each position the longest registered word starting
-- there is masked with one "*" per byte, and scanning resumes right after it.
-- The text is spliced by index instead of going through string.gsub, so
-- pattern magic characters in words are harmless and only the matched
-- occurrence is touched.
-- @param dirtyWords text to filter
-- @return the filtered text (the input itself when nothing matched)
function Sensitive:replaceSensitiveWord(dirtyWords)
    local root = self.treeRoot
    local total = string.len(dirtyWords)
    local pieces = {}
    local copyFrom = 1 -- first byte not yet copied into `pieces`
    local pos = 1
    while pos <= total do
        local matchEnd = longestMatchEnd(root, dirtyWords, pos)
        if matchEnd then
            pieces[#pieces + 1] = string.sub(dirtyWords, copyFrom, pos - 1)
            pieces[#pieces + 1] = self:getReplaceWord(matchEnd - pos + 1)
            pos = matchEnd + 1
            copyFrom = pos
        else
            pos = pos + 1
        end
    end
    if copyFrom == 1 then
        -- copyFrom only advances on a match, so nothing was replaced.
        return dirtyWords
    end
    pieces[#pieces + 1] = string.sub(dirtyWords, copyFrom)
    return table.concat(pieces)
end

-- Tell whether the string contains at least one registered sensitive word.
-- @param dirtyWords text to inspect
-- @return true when any registered word occurs in the text, false otherwise
function Sensitive:containsBadWords(dirtyWords)
    local root = self.treeRoot
    for pos = 1, string.len(dirtyWords) do
        if longestMatchEnd(root, dirtyWords, pos) then
            return true
        end
    end
    return false
end