刷LeetCode时遇到过字典树这道题,但之前还不知道它有这么多的应用
文本识别相关词
其实就是匹配词表,找到包含的最长词,我在最后附一个样例代码
分词
读苏神【中文分词系列】 1. 基于AC自动机的快速分词
发现字典树还能用于分词:AC自动机就是在字典树的基础上加入类似KMP算法的失败指针,用于多模式匹配;python的实现库为pyahocorasick
使用方式就是add + make(先用add_word添加词条,再用make_automaton构建自动机),可以点这里简单看下ahocorasick使用
苏神主要用AC自动机做分词,介绍了最大匹配法和最大概率组合两种方法
这里贴个最大概率组合的代码,更多直接看原博客
def max_proba_cut(sentence, automaton=None, unknown_penalty=-10):
    """Segment ``sentence`` into the highest-scoring sequence of matched words.

    Dynamic programming over end offsets: ``paths[k]`` holds the best
    ``(tokens, score)`` pair found so far for the prefix ``sentence[:k]``.

    :param sentence: text to segment.
    :param automaton: object whose ``iter(sentence)`` yields
        ``(end_index, value)`` pairs, where ``end_index`` is the inclusive
        offset of the match's last character, ``value[0]`` is the matched
        word and ``value[1]`` is its score. When omitted, the module-level
        ``dic`` is used (presumably a pyahocorasick automaton -- confirm
        against the code that builds ``dic``).
    :param unknown_penalty: score added each time a stretch of text with no
        usable match has to be bridged as a single token.
    :return: list of tokens covering ``sentence`` from left to right; text
        after the last match is appended as one final token.

    NOTE(review): the final lookup uses the ``end`` of the last yielded match,
    so this assumes matches arrive in non-decreasing ``end_index`` order.
    """
    if automaton is None:
        automaton = dic  # module-level automaton defined elsewhere in the file
    paths = {0: ([], 0)}
    end = 0
    for last_index, value in automaton.iter(sentence):
        word, score = value[0], value[1]
        start, end = last_index + 1 - len(word), last_index + 1
        if start not in paths:
            # No segmentation ends exactly at `start`: extend the nearest
            # earlier one with the uncovered text and charge the penalty.
            prev = max(k for k in paths if k < start)
            prev_tokens, prev_score = paths[prev]
            paths[start] = (
                prev_tokens + [sentence[prev:start]],
                prev_score + unknown_penalty,
            )
        candidate = paths[start][1] + score
        # Keep only the best-scoring segmentation for each end offset.
        if end not in paths or candidate > paths[end][1]:
            paths[end] = (paths[start][0] + [word], candidate)
    if end < len(sentence):
        # Text after the last match becomes one trailing token.
        return paths[end][0] + [sentence[end:]]
    return paths[end][0]
字典树匹配词表
class Node(object):
    """A single trie node.

    :param is_end: whether a stored word terminates at this node.
    """

    def __init__(self, is_end=False):
        # True when the path from the root to this node spells a stored word.
        # (Previously the argument was ignored and this was always False.)
        self.is_end = is_end
        # Maps the next character to the child Node.
        self.node = {}
class Trie(object):
    """Character trie over a word list, with helpers that find stored words in text."""

    def __init__(self):
        self.size = 0  # number of distinct words stored
        self.root = Node()

    @staticmethod
    def _unique(items):
        """Return ``items`` without duplicates, keeping first-occurrence order."""
        seen = set()
        unique = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    def add(self, word):
        """Insert ``word`` into the trie; re-adding a word does not change ``size``.

        :param word: sequence of characters to store.
        :return: None
        """
        cur = self.root
        for ch in word:
            child = cur.node.get(ch)
            if child is None:
                child = cur.node[ch] = Node()
            cur = child
        if not cur.is_end:
            cur.is_end = True
            self.size += 1

    def __len__(self):
        """Return the number of distinct words stored."""
        return self.size

    def __contains__(self, word):
        """Return True only if ``word`` was stored as a whole word (not just a prefix)."""
        cur = self.root
        for ch in word:
            cur = cur.node.get(ch)
            if cur is None:
                return False
        return cur.is_end

    def search1(self, title):
        """Find stored words occurring in ``title``, at most one per start offset.

        For each offset the shortest stored word starting there is taken; if
        appending exactly one more character of ``title`` also gives a stored
        word, that longer word is reported instead.

        :param title: text to scan.
        :return: list of distinct matched words, in order of first occurrence.
        """
        found = []
        root = self.root
        n = len(title)
        for i, ch in enumerate(title):
            node = root.node.get(ch)
            if node is None:
                continue
            j = i + 1
            # Walk forward until the first node that ends a word (or a dead end).
            while not node.is_end and j < n and title[j] in node.node:
                node = node.node[title[j]]
                j += 1
            if not node.is_end:
                continue
            # One-character lookahead, e.g. report "sofa bed" rather than "sofa"
            # when both are stored and the text continues with the longer one.
            nxt = node.node.get(title[j]) if j < n else None
            if nxt is not None and nxt.is_end:
                found.append(title[i:j + 1])
            else:
                found.append(title[i:j])
        return self._unique(found)

    def search2(self, title):
        """Find, for each start offset in ``title``, the longest stored word there.

        :param title: text to scan.
        :return: list of distinct matched words, in order of first occurrence.
        """
        found = []
        root = self.root
        n = len(title)
        for i, ch in enumerate(title):
            node = root.node.get(ch)
            if node is None:
                continue
            j = i + 1
            # Offset just past the longest stored word seen so far from `i`.
            # The walk may run on through prefixes of longer words that the
            # text never completes, so the last word end must be remembered.
            best = j if node.is_end else None
            while j < n:
                node = node.node.get(title[j])
                if node is None:
                    break
                j += 1
                if node.is_end:
                    best = j
            if best is not None:
                found.append(title[i:best])
        return self._unique(found)