示意图:
import collections
# Tire结点
class TireNode(object):
def __init__(self):
self.children = collections.defaultdict(TireNode)
self.is_word = False
# Tire Tree
class Tire(object):
def __init__(self):
self.root = TireNode()
def insert(self,word):
current = self.root
for char in word:
# defaultdict的索引操作:
# 1.key在defaultdict时:return current.children[key]
# 2.key不在defaultdict时,分为两步:
# (1) current.children[key] = TireNode()
# (2) return current.children[key]
current = current.children[char]
#标记位变为True,表明从根节点到这里是一个词
current.is_word = True
def search(self,word):
current = self.root
for char in word:
current = current.children.get(char)
# 词典里没有该前缀的词语
if not current:
return False
# 判断是否是一个合法词语
return current.is_word
def startsWith(self,word):
current = self.root
for char in word:
current = current.children.get(char)
# 词典里没有该前缀的词语
if not current:
return False
return True
# 匹配过程:
# 南京长江大桥
# 南京长江大
# ...
# 南
def enumerateMatch(self,sentence,space = '_'):
current = self.root
sentence = list(sentence)
matched = []
# 依次从后向前匹配
while len(sentence) - 1:
if self.search(sentence):
matched.append('_'.join(sentence))
del sentence[-1]
return matched
t = Tire()
gazetteer = ['南京', '南京市', '市长', '长江', '长江大桥', '大桥']
for gaz in gazetteer:
t.insert(gaz)
# 测试insert与search:
matchs = ['南京', '南京市', '市长', '长江', '长江大桥', '大桥','江大桥','小黑']
print('###测试insert与search:\n')
for match in matchs:
print(match,':',t.search(match))
words = ['南', '京', '市', '长', '江', '大', '桥']
print('###测试enumerateMatch:\n')
print(t.enumerateMatch(words))
print('###应用Tire Tree,找到每一个char的对应的word:\n')
words = ['南', '京', '市', '长', '江', '大', '桥']
matched = []
for _ in range(len(words)):
matched.append([])
for index in range(len(words)):
current_word = words[index:]
print(index,''.join(current_word),end = ':')
result = t.enumerateMatch(current_word)
for res in result:
matched[index + len(res.split('_')) - 1].append(res)
print(' ',result)
print('matched:',matched)
输出:
###测试insert与search:
南京 : True
南京市 : True
市长 : True
长江 : True
长江大桥 : True
大桥 : True
江大桥 : False
小黑 : False
###测试enumerateMatch:
[‘南_京_市’, ‘南_京’]
###应用Tire Tree,找到每一个char的对应的word:
0 南京市长江大桥: [‘南_京_市’, ‘南_京’]
1 京市长江大桥: []
2 市长江大桥: [‘市_长’]
3 长江大桥: [‘长_江_大_桥’, ‘长_江’]
4 江大桥: []
5 大桥: [‘大_桥’]
6 桥: []
matched: [[], [‘南_京’], [‘南_京_市’], [‘市_长’], [‘长_江’], [], [‘长_江_大_桥’, ‘大_桥’]]