# -*- coding: utf-8 -*-
__all__ = ['Ahocorasick', ]

from collections import deque


class Node(object):
    """A single state of the keyword trie / matching automaton."""

    def __init__(self):
        # Outgoing transitions: character -> child Node.
        self.next = {}
        # Failure link: the state for the longest proper suffix of this
        # node's path that is also a path in the trie. Stays None for the
        # root, and for every node until Ahocorasick.make() has run.
        self.fail = None
        # True when a keyword ends exactly at this node.
        self.isWord = False
        # Number of characters on the path from the root to this node;
        # used to recover a match's start index from its end index.
        self.depth = 0
        # Nearest node on the failure chain that ends a keyword, or None.
        # Lets search() report keywords that are suffixes of the current
        # path without walking every failure link.
        self.output = None


class Ahocorasick(object):
    """Multi-keyword matcher based on the Aho-Corasick automaton.

    Typical use: call addWord() for every keyword, then search() on the
    text. make() builds the failure links; search() calls it on demand
    when keywords were added since the last build.
    """

    def __init__(self):
        self.__root = Node()
        # True when keywords were added after the last make().
        self.__dirty = False

    def addWord(self, word):
        '''
        Add a keyword to the trie.

        @param word: the keyword to add; an empty keyword is ignored
                     because it could never produce a meaningful match
        '''
        if not word:
            return
        node = self.__root
        for ch in word:
            child = node.next.get(ch)
            if child is None:
                child = Node()
                child.depth = node.depth + 1
                node.next[ch] = child
            node = child
        node.isWord = True
        # Failure links are stale (or missing for new nodes) from now on.
        self.__dirty = True

    def make(self):
        '''
        Build the failure function (failure and output links) of the
        automaton from the keywords added so far.
        '''
        root = self.__root
        root.fail = None
        root.output = None
        # Breadth-first order is required: a node's failure link is derived
        # from the links of strictly shallower nodes, which must therefore
        # already be final when the node is processed.
        queue = deque([root])
        while queue:
            node = queue.popleft()
            for ch, child in node.next.items():
                # Walk the parent's failure chain until a state that can
                # consume `ch` is found; fall back to the root otherwise.
                fallback = node.fail
                while fallback is not None and ch not in fallback.next:
                    fallback = fallback.fail
                if fallback is None:
                    child.fail = root
                else:
                    child.fail = fallback.next[ch]
                if child.fail.isWord:
                    child.output = child.fail
                else:
                    child.output = child.fail.output
                queue.append(child)
        self.__dirty = False

    def search(self, content):
        '''
        Find every occurrence of every added keyword in content.

        @param content: the text to scan
        @return: a list of tuples (matched_text, start_index, end_index),
                 both indices inclusive, ordered by end index; matches
                 sharing an end index are listed longest first
        '''
        if self.__dirty:
            # Keywords were added since the last build; without fresh
            # failure links the scan below would dereference None.
            self.make()
        root = self.__root
        node = root
        result = []
        for pos, ch in enumerate(content):
            # Follow failure links until a state can consume `ch`.
            while node is not root and ch not in node.next:
                node = node.fail
            # At the root with no transition we simply stay at the root.
            node = node.next.get(ch, root)
            # Report the keyword ending at this state (if any) and every
            # keyword that is a proper suffix of the current path.
            hit = node if node.isWord else node.output
            while hit is not None:
                start = pos - hit.depth + 1
                result.append((content[start:pos + 1], start, pos))
                hit = hit.output
        return result