今天刷到了字典树的知识,看了一篇文章,感觉总结得挺好,链接放在下方。只不过文中的代码用的是 Java,而我平时刷题用的都是 Python,所以打算将文章中的代码都同步为 Python 版本,方便和我一样用 Python 的小伙伴学习参考~
构造字典树
经典的字典树(只包含26个小写字母)
208. 实现 Trie (前缀树)
class TrieNode:
    """A single trie node for the lowercase alphabet 'a'..'z'."""

    def __init__(self):
        # One slot per letter; None means there is no edge for that letter.
        self.children = [None] * 26
        # True when the path from the root to this node spells an inserted word.
        self.isWord = False


class Trie:
    """Classic prefix tree (LeetCode 208) over lowercase English letters."""

    def __init__(self):
        # The root stands for the empty prefix and carries no letter itself.
        self.root = TrieNode()

    def _walk(self, text):
        """Follow ``text`` letter by letter from the root.

        Returns the node reached after the last letter, or None as soon as
        a required edge is missing.
        """
        node = self.root
        for letter in text:
            # Key step: map the letter to a 0..25 slot index.
            node = node.children[ord(letter) - ord('a')]
            if node is None:
                return None
        return node

    def insert(self, word):
        """Add ``word`` to the trie, creating nodes along its path as needed."""
        node = self.root
        # For a suffix trie instead of a prefix trie, iterate over
        # reversed(word) here.
        for letter in word:
            slot = ord(letter) - ord('a')
            following = node.children[slot]
            if following is None:
                following = TrieNode()
                node.children[slot] = following
            node = following
        # The node reached after the last letter marks the end of a word.
        node.isWord = True

    def search(self, word):
        """Return True only if ``word`` itself was inserted."""
        last = self._walk(word)
        return last is not None and last.isWord

    def startsWith(self, prefix):
        """Return True if some inserted word begins with ``prefix``."""
        # Reaching the end of the path is enough; isWord is irrelevant here.
        return self._walk(prefix) is not None
例题——变式题目
变式1:利用字典树的构造过程——忽略后缀单词
820. 单词的压缩编码
class TrieNode:
    """Trie node with one child slot per lowercase letter."""

    def __init__(self):
        self.isWord = False  # a word ends at this node
        self.children = [None] * 26  # index 0..25 <-> 'a'..'z'


class Trie:
    """Suffix trie: words are inserted back-to-front so that words sharing
    a suffix share a path from the root."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word`` reversed and report its encoding cost.

        Returns ``len(word) + 1`` (the word plus its '#') when at least one
        new node had to be created, otherwise 0. No new node means the
        reversed word already lies on an existing path, i.e. the word is a
        suffix of (or equal to) something inserted earlier.
        """
        cur = self.root
        is_new = False
        for ch in reversed(word):
            c = ord(ch) - ord('a')
            if cur.children[c] is None:
                cur.children[c] = TrieNode()
                is_new = True
            cur = cur.children[c]
        cur.isWord = True
        return len(word) + 1 if is_new else 0


class Solution:
    def minimumLengthEncoding(self, words: List[str]) -> int:
        """Return the length of the shortest reference string for ``words``
        (LeetCode 820).

        A word that is a suffix of another word needs no segment of its
        own, so the answer is the sum of ``len(word) + 1`` over the words
        that are not covered by an already inserted word. Longer words are
        inserted first so a suffix word always arrives after the word that
        covers it.

        The caller's list is left untouched: a sorted copy is iterated
        instead of sorting ``words`` in place.
        """
        trie = Trie()
        longest_first = sorted(words, key=len, reverse=True)
        return sum(trie.insert(word) for word in longest_first)
附评论中的简洁做法
class Solution:
    def minimumLengthEncoding(self, words: List[str]) -> int:
        """Build the reference string greedily and return its length.

        Words are visited longest first. A word followed by '#' that
        already occurs in the string built so far is a suffix of an
        earlier word and costs nothing; otherwise it is appended.
        """
        encoded = ""
        for word in sorted(words, key=len, reverse=True):
            segment = word + "#"
            # `segment in encoded` already implies `word in encoded`.
            if segment not in encoded:
                encoded += segment
        return len(encoded)
变式2:利用字典树充分利用前缀(后缀)性质,优化暴力算法
面试题 17.13. 恢复空格
class TrieNode:
    """Node of a lowercase-letter trie."""

    def __init__(self):
        self.children = [None] * 26  # index 0..25 <-> 'a'..'z'
        self.isWord = False  # set on the node where an inserted word finishes


class Trie:
    """Suffix trie: every word is stored back-to-front."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Store ``word`` reversed so lookups can scan a sentence right-to-left."""
        node = self.root
        for letter in reversed(word):
            slot = ord(letter) - ord('a')
            if node.children[slot] is None:
                node.children[slot] = TrieNode()
            node = node.children[slot]
        node.isWord = True

    def search(self, sentence, end):
        """Return every start index ``i`` such that ``sentence[i:end + 1]``
        is a stored word.

        The scan moves leftwards from ``end`` and stops at the first letter
        that has no edge in the trie.
        """
        starts = []
        node = self.root
        pos = end
        while pos >= 0:
            node = node.children[ord(sentence[pos]) - ord('a')]
            if node is None:
                break
            if node.isWord:
                starts.append(pos)
            pos -= 1
        return starts


class Solution:
    def respace(self, dictionary, sentence):
        """Return the minimum number of characters of ``sentence`` that are
        not covered by dictionary words (LeetCode interview 17.13)."""
        trie = Trie()
        for entry in dictionary:
            trie.insert(entry)

        total = len(sentence)
        # best[k]: fewest unrecognised characters within sentence[:k].
        best = [0] * (total + 1)
        for end in range(total):
            # Default: treat sentence[end] as unrecognised.
            options = [best[end] + 1]
            # Or finish a dictionary word at `end`; its start index tells
            # which shorter prefix the answer extends.
            options.extend(best[start] for start in trie.search(sentence, end))
            best[end + 1] = min(options)
        return best[total]
# Quick demo: run respace on a sample sentence and print the result.
sentence = "jesslookedjustliketimherbrother"
dictionary = ["looked", "just", "like", "her", "brother"]
solution = Solution()
result = solution.respace(dictionary, sentence)
print(result)
附原始动态规划代码
class Solution:
    def respace(self, dictionary, sentence):
        """Plain dynamic-programming version (no trie).

        Returns the minimum number of characters of ``sentence`` that are
        not part of any dictionary word.
        """
        known = set(dictionary)
        size = len(sentence)
        # unmatched[k]: fewest unrecognised characters within sentence[:k].
        unmatched = [0]
        for end in range(1, size + 1):
            # Worst case: the character just before `end` is unrecognised.
            best = unmatched[end - 1] + 1
            for start in range(end):
                # sentence[start:end] excludes index `end`, which lines up
                # with unmatched[start] covering exactly sentence[:start].
                if sentence[start:end] in known and unmatched[start] < best:
                    best = unmatched[start]
            unmatched.append(best)
        return unmatched[size]
变式3:search方法的变异——match递归
- 经典的search方法,是通过一个cur指针(引用),根据word的字符,一条路走下去
- 其实,它还有另一种思路——每次只判断当前节点是否配对的【递归】写法:
class TrieNode:
    """Trie node with one child slot per lowercase letter."""

    def __init__(self):
        self.isWord = False  # a word ends at this node
        self.children = [None] * 26  # index 0..25 <-> 'a'..'z'


class Trie:
    """Prefix trie whose lookup is written recursively (``match``)."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie.

        Letters are inserted front-to-back. ``match`` reads ``word`` from
        index 0 upwards, so inserting in reverse would make ``search(w)``
        fail right after ``insert(w)`` for every non-palindrome.
        """
        cur = self.root
        for ch in word:
            c = ord(ch) - ord('a')
            if cur.children[c] is None:
                cur.children[c] = TrieNode()
            cur = cur.children[c]
        cur.isWord = True

    def search(self, word):
        """Return True if ``word`` was inserted."""
        return self.match(word, self.root, 0)

    def match(self, word, node, start):
        """Recursive lookup: does ``word[start:]`` spell a stored word when
        followed from ``node``?

        ``start`` plays the role of the loop index in the iterative
        version. At each step the letter ``word[start]`` matches when
        ``node`` has a child in that letter's slot.
        """
        if start == len(word):
            # All letters consumed: succeed only on an end-of-word node.
            return node.isWord
        child = node.children[ord(word[start]) - ord('a')]
        return child is not None and self.match(word, child, start + 1)
Q:我知道match递归写法很妙,但有什么用呢?cur一条路走到黑的思路不是更好理解吗?
A:恰恰是因为“cur一条路走到黑”的思路有弊端——有时我们需要走一个分叉的路,去尝试更多的可能。
通过下面的两道变式题目,就能理解递归型search的强大之处
变式4:含有通配符的字典树匹配——递归的search
211. 添加与搜索单词 - 数据结构设计
【笔记】该题与「208. 实现 Trie (前缀树)」大同小异,只需要对 `.` 单独处理就可以了。其他代码可以复用,只需要重写 search 辅助函数:当出现 `.` 时,需要遍历 `res->children_[26]`(即当前节点的全部 26 个子节点);该过程深入几层后,可能出现不满足的情况,此时需要回溯,所以最好采用递归。(来自评论)
class TrieNode:
    """Trie node with one child slot per lowercase letter."""

    def __init__(self):
        self.children = [None] * 26  # index 0..25 <-> 'a'..'z'
        self.isWord = False  # a word ends at this node


class WordDictionary:
    """Word store supporting lookups where '.' matches any single letter
    (LeetCode 211)."""

    def __init__(self):
        self.root = TrieNode()

    def addWord(self, word):
        """Insert ``word`` front-to-back, creating missing nodes."""
        node = self.root
        for letter in word:
            slot = ord(letter) - ord('a')
            if node.children[slot] is None:
                node.children[slot] = TrieNode()
            node = node.children[slot]
        node.isWord = True

    def search(self, word):
        """Return True if some stored word matches the pattern ``word``."""
        return self.match(word, self.root, 0)

    def match(self, word, node, start):
        """Recursively match ``word[start:]`` against the subtree at ``node``."""
        if start == len(word):
            return node.isWord

        symbol = word[start]
        if symbol == '.':
            # Wildcard: branch into every existing child and backtrack on
            # failure; any() stops at the first successful branch.
            return any(
                child is not None and self.match(word, child, start + 1)
                for child in node.children
            )

        child = node.children[ord(symbol) - ord('a')]
        return child is not None and self.match(word, child, start + 1)
变式5:允许且必须变化一个字符后再匹配——递归的search
676. 实现一个魔法字典
你会遇到一个棘手的问题:当字典树中有 "hello" 和 "hallo" 时,如果沿用经典的 search 写法,search("hello") 会返回 false(而正确答案是 true,因为把 "hello" 的 'e' 换成 'a' 就能匹配 "hallo")。
问题的关键在于:一般我们写 search,都是根据 word 先算出下标——这会导致在字典树中沿着 hello 这条路一路走到头,最后因为没有修改任何一个字母而返回 false。
因此,千万要抛弃这个字典树的 search 模板,改为一次 for(26) 的遍历。逻辑是:
发现某个字母"可行"后,再去看这个"可行的字母"是不是就是 word[start](即 Java 中的 word.charAt(start));
而不是根据 word[start] 去看这个字母是否"可行"("可行"的意思是:它对应字典树中的一个合法节点)。理解上面这句话,是解决开头那个问题的关键。
class TrieNode:
    """Trie node with one child slot per lowercase letter."""

    def __init__(self):
        self.children = [None] * 26  # index 0..25 <-> 'a'..'z'
        self.isWord = False  # a word ends at this node


class MagicDictionary:
    """Dictionary answering "does exactly one letter substitution turn this
    word into a stored word?" (LeetCode 676)."""

    def __init__(self):
        self.root = TrieNode()

    def buildDict(self, dictionary: List[str]) -> None:
        """Insert every word of ``dictionary`` into the trie."""
        for entry in dictionary:
            node = self.root
            for letter in entry:
                slot = ord(letter) - ord('a')
                if node.children[slot] is None:
                    node.children[slot] = TrieNode()
                node = node.children[slot]
            node.isWord = True

    def search(self, word):
        """Return True if changing exactly one letter of ``word`` yields a
        stored word."""
        return self.match(word, self.root, 0, False)

    def match(self, word, node, start, has_chance):
        """Match ``word[start:]`` below ``node``, trying every existing child.

        Despite its name, ``has_chance`` is False while the single
        substitution is still available and True once it has been spent.
        """
        if start == len(word):
            # Exactly one change is mandatory, so the substitution must
            # have been used by the time the word is exhausted.
            return node.isWord and has_chance

        wanted = ord(word[start]) - ord('a')
        # Enumerate the children that exist and then compare with the
        # wanted letter, instead of indexing by the wanted letter directly.
        # That is what allows branching onto a different letter.
        for slot, child in enumerate(node.children):
            if child is None:
                continue
            if slot == wanted:
                # Same letter: carry the flag through unchanged.
                if self.match(word, child, start + 1, has_chance):
                    return True
            elif not has_chance and self.match(word, child, start + 1, True):
                # Different letter: allowed only if nothing was changed yet.
                return True
        return False
自己加的题
648. 单词替换
做法一:哈希集合
首先将 dictionary中所有词根放入哈希集合中,然后对于 sentence中的每个单词,由短至长遍历它所有的前缀,如果这个前缀出现在哈希集合中,则我们找到了当前单词的最短词根,将这个词根替换原来的单词。最后返回重新拼接的句子。
class Solution:
    def replaceWords(self, dictionary: List[str], sentence: str) -> str:
        """Replace each word of ``sentence`` by its shortest root, if any
        (LeetCode 648, hash-set approach).

        For every word, prefixes are tried from shortest to longest and the
        first one found in ``dictionary`` replaces the word. Words without
        a matching root are kept as they are.
        """
        dictionarySet = set(dictionary)
        words = sentence.split(' ')
        for i, word in enumerate(words):
            # Bound by the length of the current word, not by the number of
            # words in the sentence; otherwise roots longer than the word
            # count would never be tried.
            for j in range(1, len(word) + 1):
                if word[:j] in dictionarySet:
                    words[i] = word[:j]
                    break
        return ' '.join(words)
作者:力扣官方题解
链接:https://leetcode.cn/problems/replace-words/solutions/1649109/dan-ci-ti-huan-by-leetcode-solution-pl6v/
来源:力扣(LeetCode)
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
做法二:字典树
与哈希集合不同,我们用 dictionary中所有词根构建一棵字典树,并用特殊符号标记结尾。在搜索前缀时,只需在字典树上搜索出一条最短的前缀路径即可。
class Solution:
    def replaceWords(self, dictionary: List[str], sentence: str) -> str:
        """Replace each word of ``sentence`` by its shortest root, using a
        trie built from nested dicts (LeetCode 648).

        The key '#' inside a trie level marks "a root ends here".
        """
        root = {}
        for entry in dictionary:
            level = root
            for ch in entry:
                level = level.setdefault(ch, {})
            level['#'] = {}

        def shortest_root(word):
            # Walk down while letters match; the first end marker met
            # belongs to the shortest root that prefixes `word`.
            level = root
            for cut, ch in enumerate(word):
                if '#' in level:
                    return word[:cut]
                level = level.get(ch)
                if level is None:
                    return word
            return word

        return ' '.join(shortest_root(word) for word in sentence.split(' '))
作者:力扣官方题解
链接:https://leetcode.cn/problems/replace-words/solutions/1649109/dan-ci-ti-huan-by-leetcode-solution-pl6v/
来源:力扣(LeetCode)
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
class Trie:
    """Trie in which every node is itself a ``Trie``; children are kept in
    a dict keyed by character."""

    def __init__(self):
        self.isEnd = False  # a root word ends at this node
        self.children = dict()  # char -> Trie

    def insert(self, word: str) -> None:
        """Add ``word`` to the trie."""
        node = self
        for ch in word:
            child = node.children.get(ch)
            if child is None:
                child = Trie()
                node.children[ch] = child
            node = child
        node.isEnd = True

    def search(self, word: str) -> str:
        """Return the shortest stored root that prefixes ``word``, or
        ``word`` itself when there is none."""
        node = self
        for length, ch in enumerate(word, start=1):
            node = node.children.get(ch)
            if node is None:
                # Fell off the trie before reaching any root.
                return word
            if node.isEnd:
                return word[:length]
        # Word exhausted without meeting the end of a root.
        return word


class Solution:
    def replaceWords(self, dictionary: List[str], sentence: str) -> str:
        """Replace each word of ``sentence`` by its shortest root (LeetCode 648)."""
        trie_tree = Trie()
        for root_word in dictionary:
            trie_tree.insert(root_word)
        return ' '.join(trie_tree.search(word) for word in sentence.split(" "))
# 哈希集合
class Solution:
    def replaceWords(self, dictionary: List[str], sentence: str) -> str:
        """Replace each word of ``sentence`` by its shortest root, if any
        (LeetCode 648, hash-set approach).

        Words that have no root in ``dictionary`` are kept unchanged.
        """
        roots = set(dictionary)

        def shorten(word: str) -> str:
            # Prefixes are generated shortest-first and bounded by the
            # length of this word, so the first hit is the shortest root.
            prefixes = (word[:end] for end in range(1, len(word) + 1))
            return next((p for p in prefixes if p in roots), word)

        return ' '.join(shorten(word) for word in sentence.split(' '))
class Solution:
    def replaceWords(self, dict: List[str], sentence: str) -> str:
        """Replace each word of ``sentence`` by its shortest root, if any
        (LeetCode 648, sorted-roots approach).

        The roots are scanned in lexicographic order. When two roots both
        prefix a word, the shorter one is a prefix of the longer one and
        therefore sorts first, so the first match is the shortest root.

        The parameter name ``dict`` shadows the builtin; it is kept so
        existing keyword callers keep working. The caller's list is not
        modified: a sorted copy is used instead of sorting in place.
        """
        roots = sorted(dict)

        def shorten(word: str) -> str:
            return next((root for root in roots if word.startswith(root)), word)

        return ' '.join(shorten(word) for word in sentence.split(' '))
# 字典树
class TrieNode:
    """Trie node with one child slot per lowercase letter."""

    def __init__(self):
        self.isEnd = False  # a root word ends at this node
        self.children = [None] * 26  # index 0..25 <-> 'a'..'z'


class Trie:
    """Array-based prefix trie holding the dictionary roots."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie."""
        node = self.root
        for letter in word:
            slot = ord(letter) - ord('a')
            if node.children[slot] is None:
                node.children[slot] = TrieNode()
            node = node.children[slot]
        node.isEnd = True

    def get_prefix(self, word):
        """Return the shortest stored root that prefixes ``word``, or
        ``word`` itself when there is none."""
        node = self.root
        for consumed, letter in enumerate(word):
            # Checked before stepping, so only letters consumed so far count.
            if node.isEnd:
                return word[:consumed]
            node = node.children[ord(letter) - ord('a')]
            if node is None:
                return word
        return word


class Solution:
    def replace_words(self, dictionary, sentence):
        """Replace each word of ``sentence`` by its shortest root (LeetCode 648)."""
        tr = Trie()
        for root_word in dictionary:
            tr.insert(root_word)
        return " ".join(tr.get_prefix(word) for word in sentence.split(" "))
677. 键值映射
class Trie:
    """Trie whose nodes carry the value of the key ending there (0 if none)."""

    def __init__(self):
        """Create an empty node."""
        self.value = 0
        self.isEnd = False
        self.children = dict()  # char -> Trie

    def insert(self, word: str, value: int) -> None:
        """Store ``value`` under ``word``, overwriting any earlier value."""
        node = self
        for ch in word:
            child = node.children.get(ch)
            if child is None:
                child = Trie()
                node.children[ch] = child
            node = child
        node.value = value
        node.isEnd = True

    def search(self, word: str) -> int:
        """Return the sum of the values of all keys that start with ``word``
        (0 when no key has that prefix)."""
        node = self
        for ch in word:
            node = node.children.get(ch)
            if node is None:
                return 0
        return self.dfs(node)

    def dfs(self, root) -> int:
        """Return the total of ``value`` over the subtree rooted at ``root``."""
        if not root:
            return 0
        below = sum(self.dfs(child) for child in root.children.values())
        return root.value + below


class MapSum:
    """Key/value map with prefix-sum queries (LeetCode 677), backed by a trie."""

    def __init__(self):
        """Start with an empty trie."""
        self.trie_tree = Trie()

    def insert(self, key: str, val: int) -> None:
        """Set ``key`` to ``val``."""
        self.trie_tree.insert(key, val)

    def sum(self, prefix: str) -> int:
        """Return the sum of values of all keys starting with ``prefix``."""
        return self.trie_tree.search(prefix)
# 暴力
class MapSum:
    """Brute-force key/value map with prefix-sum queries (LeetCode 677)."""

    def __init__(self):
        self.map = {}  # key -> latest value

    def insert(self, key: str, val: int) -> None:
        """Set ``key`` to ``val``, replacing any previous value."""
        self.map[key] = val

    def sum(self, prefix: str) -> int:
        """Scan every stored key and add up the values of those that start
        with ``prefix``."""
        return sum(
            value for name, value in self.map.items() if name.startswith(prefix)
        )
# 前缀哈希映射
class MapSum:
    """Key/value map with prefix-sum queries (LeetCode 677) that keeps a
    running total for every prefix of every key."""

    def __init__(self):
        self.map = {}  # key -> latest value
        self.prefixmap = {}  # non-empty prefix -> sum of values below it

    def insert(self, key: str, val: int) -> None:
        """Set ``key`` to ``val`` and update the totals of all its prefixes."""
        # Only the change relative to the old value (0 for a new key) must
        # be added, otherwise a re-inserted key would be counted twice.
        delta = val - self.map.get(key, 0)
        self.map[key] = val
        for end in range(1, len(key) + 1):
            head = key[:end]
            self.prefixmap[head] = self.prefixmap.get(head, 0) + delta

    def sum(self, prefix: str) -> int:
        """Return the stored total for ``prefix``, or 0 if it was never seen."""
        return self.prefixmap.get(prefix, 0)
# 字典树[仿照前缀哈希映射]
class TrieNode:
    """Trie node holding the sum of the values of all keys passing through it."""

    def __init__(self):
        self.next = [None] * 26  # index 0..25 <-> 'a'..'z'
        self.val = 0


class MapSum:
    """Key/value map with prefix-sum queries (LeetCode 677): the prefix
    totals live in trie nodes instead of a hash map."""

    def __init__(self):
        self.map = {}  # key -> latest value
        self.root = TrieNode()

    def insert(self, key: str, val: int) -> None:
        """Set ``key`` to ``val`` and add the change to every node on its path."""
        # Only the difference to the previous value (0 for a new key) is
        # propagated, so re-inserting a key does not double count.
        delta = val - self.map.get(key, 0)
        self.map[key] = val
        cursor = self.root
        for letter in key:
            slot = ord(letter) - ord('a')
            if cursor.next[slot] is None:
                cursor.next[slot] = TrieNode()
            cursor = cursor.next[slot]
            cursor.val += delta

    def sum(self, prefix: str) -> int:
        """Return the total stored at the node reached by ``prefix``, or 0
        when that path does not exist."""
        cursor = self.root
        for letter in prefix:
            cursor = cursor.next[ord(letter) - ord('a')]
            if cursor is None:
                return 0
        return cursor.val