【NLP】词典树增删改查，最大后向匹配结合

Du恒之

已于 2022-04-04 18:45:42 修改

阅读量924

点赞数

分类专栏： NLP 文章标签： python

于 2022-04-04 18:37:31 首次发布

本文链接：https://blog.csdn.net/qq_44866428/article/details/123957995

版权

NLP 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

词典树增、删、改、查

class Node:
    """A single trie node: one stored value plus children keyed by character."""

    def __init__(self, value) -> None:
        # Value stored at this node.
        self._value = value
        # Maps a character to the child Node reached through it.
        self._children = {}

    def _add_child(self, char, value, overwrite=False):
        """Return the child reached via ``char``, creating it when absent.

        A newly created child stores ``value``.  An existing child keeps its
        current value unless ``overwrite`` is true, in which case its value
        is replaced by ``value``.
        """
        existing = self._children.get(char)
        if existing is not None:
            if overwrite:
                existing._value = value
            return existing
        created = Node(value)
        self._children[char] = created
        return created


class Trie(Node):
    """Character trie with dict-style access; the Trie object is the root node.

    A key counts as present only when the node at the end of its path holds
    a value other than ``None``.
    """

    def __init__(self) -> None:
        # The root itself stores no value.
        super().__init__(None)

    def __contains__(self, key):
        """Return True when ``key`` maps to a value that is not ``None``."""
        found = self[key]
        return found is not None

    def __getitem__(self, key):
        """Return the value stored for ``key``, or ``None`` if its path is missing.

        An empty ``key`` yields the root's own value.
        """
        node = self
        for char in key:
            child = node._children.get(char)
            if child is None:
                return None
            node = child
        return node._value

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, creating missing nodes along the path.

        Nodes for the leading characters are created with ``None`` when absent
        and left untouched when present; only the node for the final character
        has its value (over)written.  An empty ``key`` changes nothing.
        """
        node = self
        for position, char in enumerate(key, start=1):
            is_last = position == len(key)
            node = node._add_child(char, value if is_last else None, is_last)


if __name__ == '__main__':
    # Demo of create / delete / update / read on the trie.
    trie = Trie()
    # Create: insert words together with their translations.
    trie['自然'] = 'nature'
    trie['自然人'] = 'human'
    trie['自然语言'] = 'language'
    # The phrase is separated by a plain space (previously a stray tab).
    trie['自语'] = 'talk to oneself'
    trie['入门'] = 'introduction'
    assert '自然' in trie
    # Delete: storing None makes the key count as absent; its node stays in
    # place, so longer words sharing the prefix remain reachable.
    trie['自然'] = None
    assert '自然' not in trie
    assert trie['自然人'] == 'human'
    # Update: assigning to an existing key overwrites its value.
    trie['自然语言'] = 'human language'
    assert trie['自然语言'] == 'human language'
    # Read
    assert trie['入门'] == 'introduction'
    # Trie defines no __repr__/__str__, so this prints the default object
    # representation.
    print(trie)

词典树与最大后向匹配结合

class Node:
    """Trie node holding a value and a character-to-child mapping."""

    def __init__(self, value) -> None:
        self._value = value      # value stored at this node
        self._children = {}      # character -> child Node

    def _add_child(self, char, value, overwrite=False):
        """Fetch the child for ``char``, inserting a new one if none exists.

        A freshly inserted child is initialised with ``value``.  When the
        child already exists its value is replaced only if ``overwrite`` is
        true.  The child node is returned in every case.
        """
        child = self._children.get(char)
        is_new = child is None
        if is_new:
            child = self._children[char] = Node(value)
        if overwrite and not is_new:
            child._value = value
        return child


class Trie(Node):
    """Root of a character trie exposing ``in``, ``trie[key]`` and assignment.

    Membership is defined by the stored value: a key whose end node holds
    ``None`` is reported as absent.
    """

    def __init__(self) -> None:
        # The root carries no value of its own.
        super().__init__(None)

    def __contains__(self, key):
        """Report whether ``key`` currently maps to a non-``None`` value."""
        return not (self[key] is None)

    def __getitem__(self, key):
        """Look up ``key``; return its value, or ``None`` when the path breaks off.

        With an empty ``key`` the loop body never runs and the root's value is
        returned.
        """
        node = self
        for char in key:
            node = node._children.get(char)
            if node is None:
                break
        else:
            # Every character was matched (or the key was empty).
            return node._value
        return None

    def __setitem__(self, key, value):
        """Associate ``value`` with ``key``.

        Missing nodes on the way are created holding ``None``; existing ones
        are not modified.  The node of the last character always receives
        ``value``.  Nothing happens for an empty ``key``.
        """
        node = self
        for index, char in enumerate(key):
            if index == len(key) - 1:
                node = node._add_child(char, value, True)
                continue
            node = node._add_child(char, None, False)




def backward_match(dic, text):
    """Segment ``text`` by backward (right-to-left) maximum matching.

    Starting from the end of ``text``, the longest substring that ends at the
    current position and is contained in ``dic`` is taken as the next word;
    when no such substring exists, the single character at that position is
    used.  Scanning then continues to the left of that word.

    Args:
        dic: Container queried with ``substring in dic`` (for example a set
            of words or a trie implementing ``__contains__``).
        text: The string to segment.

    Returns:
        The words in the order they were found, i.e. from the end of ``text``
        towards its start.  An empty ``text`` gives an empty list.
    """
    word_list = []
    end = len(text) - 1
    # ``>= 0`` (not ``> 0``): a word ending at index 0 must be emitted too,
    # otherwise a leftover single leading character is silently dropped.
    while end >= 0:
        word = text[end]
        # Candidates grow as ``start`` moves left, so the last hit is the
        # longest match ending at ``end``.
        for start in range(end - 1, -1, -1):
            candidate = text[start:end + 1]
            if candidate in dic:
                word = candidate
        word_list.append(word)
        end -= len(word)

    return word_list

dic = {"效果","研究","口红","中国","进口","红酒","中国进口","研究生","起源","生命"}

# 1. Instantiate the trie.
trie = Trie()
# 2. Insert every dictionary word; any non-None value marks the word as present.
for word in dic:
    trie[word] = 1

text = "研究生命起源"

# Match against the trie rather than the raw set, so the structure built
# above is the one actually used for the lookups.
print(backward_match(trie, text))