常用python算法实现(二)——前缀树TrieTree(包含删除节点)

一.概述(多个关键词-实体检索-查询)

         TrieTree(前缀树),又被称为字典树、单词查找树,是一种比较常见的数据存储结构与算法。

         顾名思义,前缀树便是公共的前缀字符只保存一次的多路树。如你所见,它的基本思想是以空间换时间:查找一个关键词的时间复杂度为O(L)(L为关键词的长度),与树中已保存的关键词数量无关,效果还不错。不过,我觉得它应用广泛的另外一个原因是它保存了字符的顺序。

         应用: 字符串检索、查询与排序,前缀与公共前缀等。

        github地址: https://github.com/yongzhuo/Tookit-Sihui/blob/master/tookit_sihui/ml_common/trie_tree/trie_tree.py

二.实现(多个关键词-实体检索-查询)

            实现的是: 从句子中查找并提取关键词或者实体

# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time     :2019/6/27 16:40
# @author   :Mo
# @function :TrieTree of keywords find, 只返回查全的情况, 查找句子中的关键词(例如影视名、人名、关键词、实体等)


import logging
logger = logging


class TrieNode:
    """
        Single node of the prefix tree.

        ``child`` maps an edge label (one element of a keyword) to the
        TrieNode reached by following that edge.
    """
    def __init__(self):
        # Every node gets its own, initially empty, mapping.
        self.child = dict()


class TrieTree:
    """
        Prefix tree (trie) that stores keywords and extracts them from sentences.

        A stored keyword is a path starting at ``self.root`` whose last node has
        an extra ``'[END]'`` child. That marker is what separates a complete
        keyword from a mere prefix of a longer keyword.
    """
    def __init__(self):
        self.algorithm = "trietree"
        self.root = TrieNode()

    def add_keyword(self, keyword):
        """
            Insert one keyword into the tree.
        :param keyword: str or list of str, the keyword (iterated element by element)
        :return: None
        """
        node_curr = self.root
        for word in keyword:
            node_next = node_curr.child.get(word)
            if node_next is None:
                node_next = TrieNode()
                node_curr.child[word] = node_next
            node_curr = node_next
        # Mark the last node as the end of a complete keyword.
        if node_curr.child.get('[END]') is None:
            node_curr.child['[END]'] = TrieNode()
        logger.info("add {} success!".format("".join(keyword)))

    def delete_keyword(self, keyword):
        """
            Remove one keyword from the tree.

            The end marker of the keyword is removed and every node that no
            longer leads to any keyword is pruned. Keywords sharing a prefix
            with the deleted one are left intact. If the keyword is not stored,
            nothing is changed and the failure is logged.
        :param keyword: str or list of str, the keyword to remove
        :return: None
        """
        node_curr = self.root
        path = []  # (parent node, edge label) for every step taken from the root
        for word in keyword:
            node_next = node_curr.child.get(word)
            if node_next is None:
                # The path does not exist, so the keyword cannot be stored.
                node_curr = None
                break
            path.append((node_curr, word))
            node_curr = node_next
        if node_curr is None or node_curr.child.get('[END]') is None:
            logger.info("{} is not in trietree, delete keyword faild!".format("".join(keyword)))
            return
        node_curr.child.pop('[END]')
        # Walk back towards the root and drop nodes that became dead ends.
        # Stop at the first node that still has children (another keyword
        # or an end marker still depends on it).
        for parent, word in reversed(path):
            if parent.child[word].child:
                break
            del parent.child[word]

    def add_keywords_from_list(self, keywords):
        """
            Insert several keywords.
        :param keywords: list, keywords to insert (each one a str or a list of str)
        :return: None
        """
        for keyword in keywords:
            self.add_keyword(keyword)

    def find_keyword(self, sentence):
        """
            Extract all stored keywords from a sentence.

            The sentence is scanned left to right. At every start position the
            longest stored keyword beginning there is taken, and scanning
            resumes right after it, so matches never overlap. When no keyword
            starts at a position, the scan moves on by one character.
        :param sentence: str, the text to search
        :return: list of str, the keywords found, in order of appearance
        """
        # Kept as an assert (not a raise) so callers see the same exception type.
        assert isinstance(sentence, str)
        if not sentence:
            return []

        keyword_list = []
        index_last = len(sentence)
        start = 0
        while start < index_last:
            node_curr = self.root
            match_end = -1  # exclusive end of the longest keyword starting at `start`
            for pos in range(start, index_last):
                node_curr = node_curr.child.get(sentence[pos])
                if node_curr is None:
                    break
                if node_curr.child.get('[END]') is not None:
                    # Remember it, but keep walking: a longer keyword may follow.
                    match_end = pos + 1
            if match_end > start:
                keyword_list.append(sentence[start:match_end])
                start = match_end
            else:
                start += 1
        return keyword_list

    def match_keyword(self, keyword):
        """
            Tell whether a keyword is stored in the tree as a complete keyword.
        :param keyword: str or list of str, the keyword to look up
        :return: boolean, True if stored, False otherwise (also for mere prefixes)
        """
        node = self.root
        for kw in keyword:
            node = node.child.get(kw)
            if node is None:
                return False
        return node.child.get('[END]') is not None


def get_trie_tree_class(keywords):
    """
        Create a TrieTree and load every keyword of the given list into it.
    :param keywords: list, keywords to store in the tree
    :return: TrieTree, the populated tree instance
    """
    tree = TrieTree()
    tree.add_keywords_from_list(keywords)
    return tree


if __name__ == "__main__":
    print("".join("你好呀"))
    # Test 1: use the class directly.
    trie = TrieTree()
    keywords = ['英雄', '人在囧途', '那些年,我们一起追过的女孩', '流浪地球', '华娱',
                '犬夜叉', '火影', '名侦探柯南', '约会大作战', '名作之壁', '动漫',
                '乃木坂46', 'akb48', '飘', '最后的武士', '约会', '英雄2', '日娱',
                '2012', '第九区', '星球大战', '侏罗纪公园', '泰坦尼克号', 'Speed']
    # Each keyword is stored as a list of its characters.
    keywords = [list(keyword.strip()) for keyword in keywords]
    trie.add_keywords_from_list(keywords)  # build the tree
    keyword = trie.find_keyword('第九区约会, 侏罗纪公园和泰坦尼克号泰坦尼克号')
    print(keyword)
    # delete_keyword returns None, so its result is not kept.
    trie.delete_keyword('英雄')
    trie.delete_keyword('英雄3')  # not in the keyword list: only logs the failure

    # '英雄' was deleted above; '英雄2' shares its prefix and was not deleted.
    keyword = trie.match_keyword('英雄')
    keyword2 = trie.match_keyword('英雄2')

    print(keyword)
    print(keyword2)

    # Test 2: build the tree through the factory function, then query it
    # interactively until the input stream ends or the user interrupts.
    trie_tree = get_trie_tree_class(keywords)
    while True:
        print("sihui请你输入:")
        try:
            input_ques = input()
        except (EOFError, KeyboardInterrupt):
            # End of input (e.g. piped stdin) or Ctrl-C: leave the demo cleanly.
            break
        found = trie_tree.find_keyword(input_ques)
        print(found)

希望对你有所帮助!
 

# Hello, below is a Python implementation of the FP-Growth algorithm, for reference.


# FP-tree node
class TreeNode:
    """One node of an FP-tree.

    :param name: the item stored in this node
    :param count: number of transactions whose path runs through this node
    :param parent: TreeNode, the parent node (None for the root)
    """
    def __init__(self, name, count, parent):
        self.name = name
        self.count = count
        self.parent = parent
        self.children = {}  # item -> child TreeNode
        self.next = None    # next node holding the same item (header-table chain)

    def increase(self, count):
        """Add ``count`` to this node's counter."""
        self.count += count

    def display(self, ind=1):
        """Print this subtree, one node per line, indented by depth."""
        print(' '*ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.display(ind+1)


# Build the FP-tree
def build_FP_tree(dataset, min_support):
    """Build an FP-tree and its header table.

    :param dataset: dict mapping a transaction (hashable iterable of items)
        to the number of times it occurs
    :param min_support: minimum total count for an item to be kept
    :return: (root, header_table) where header_table maps
        item -> [support, first TreeNode of that item]; (None, None) when no
        item reaches min_support
    """
    totals = {}
    for trans, count in dataset.items():
        # dict.fromkeys drops repeated items but keeps their order, so an item
        # listed twice in one transaction is counted once.
        for item in dict.fromkeys(trans):
            totals[item] = totals.get(item, 0) + count
    freq_items = {item: [total, None]
                  for item, total in totals.items() if total >= min_support}
    if not freq_items:
        return None, None
    # One global item order (support descending, ties by first appearance).
    # Every transaction must use the same order, otherwise an itemset can be
    # split over several branches and be reported more than once.
    ranked = sorted(freq_items, key=lambda item: freq_items[item][0], reverse=True)
    rank = {item: pos for pos, item in enumerate(ranked)}
    root = TreeNode('Root', 1, None)
    for trans, count in dataset.items():
        ordered_items = sorted(
            (item for item in dict.fromkeys(trans) if item in rank),
            key=rank.__getitem__)
        if ordered_items:
            update_FP_tree(ordered_items, root, freq_items, count)
    return root, freq_items


# Update tree nodes and header-table chains
def update_FP_tree(items, node, freq_items, count):
    """Insert one ordered transaction below ``node``.

    :param items: list of items, already in the global item order
    :param node: TreeNode to insert below
    :param freq_items: header table, item -> [support, first TreeNode]
    :param count: number of occurrences of this transaction
    """
    for item in items:
        child = node.children.get(item)
        if child is None:
            child = TreeNode(item, count, node)
            node.children[item] = child
            # Register the new node in the chain of nodes holding this item.
            if freq_items[item][1] is None:
                freq_items[item][1] = child
            else:
                update_links(freq_items[item][1], child)
        else:
            child.increase(count)
        node = child


# Update the node chain
def update_links(node, target_node):
    """Append ``target_node`` to the end of the chain starting at ``node``."""
    while node.next is not None:
        node = node.next
    node.next = target_node


# Collect the conditional pattern base of an item
def find_prefix_path(node):
    """Collect the prefix paths of every node in a header-table chain.

    :param node: first TreeNode of the chain (or None)
    :return: dict mapping prefix path (tuple of items, nearest ancestor first)
        to the count of the chain node it belongs to
    """
    cond_pats = {}
    while node is not None:
        prefix = []
        ascend_FP_tree(node, prefix)
        # prefix[0] is the node's own item; only the ancestors form the path.
        if len(prefix) > 1:
            cond_pats[tuple(prefix[1:])] = node.count
        node = node.next
    return cond_pats


# Walk up the FP-tree to collect a prefix
def ascend_FP_tree(node, prefix):
    """Append the items from ``node`` up to (excluding) the root to ``prefix``."""
    while node.parent is not None:
        prefix.append(node.name)
        node = node.parent


# Recursively mine frequent itemsets
def mine_FP_tree(freq_items, header_table, min_support, prefix, frequentItemList):
    """Mine all frequent itemsets that extend ``prefix``.

    :param freq_items: unused; kept so the signature stays unchanged
    :param header_table: dict, item -> [support, first TreeNode of that item]
    :param min_support: minimum support of a frequent itemset
    :param prefix: set of items already fixed
    :param frequentItemList: list that receives (itemset, support) tuples
    """
    # Sort on the support only: the second list element is a TreeNode and
    # cannot be compared.
    items = sorted(header_table, key=lambda item: header_table[item][0])
    for item in items:
        new_freq_set = prefix.copy()
        new_freq_set.add(item)
        support = header_table[item][0]
        frequentItemList.append((new_freq_set, support))
        cond_patt_bases = find_prefix_path(header_table[item][1])
        cond_tree, cond_items = build_FP_tree(cond_patt_bases, min_support)
        if cond_items is not None:
            # The conditional header table goes in the header_table position.
            mine_FP_tree(cond_tree, cond_items, min_support, new_freq_set, frequentItemList)


# Entry point
def FP_growth(dataset, min_support):
    """Return all frequent itemsets of a dataset.

    :param dataset: dict mapping transaction -> occurrence count, or any
        iterable of transactions (each occurrence then counts once)
    :param min_support: minimum support of a frequent itemset
    :return: list of (set of items, support) tuples, or None when no item
        reaches min_support
    """
    if not hasattr(dataset, 'items'):
        # Plain iterable of transactions: collapse it into transaction -> count.
        counted = {}
        for trans in dataset:
            key = tuple(dict.fromkeys(trans))
            counted[key] = counted.get(key, 0) + 1
        dataset = counted
    root, header_table = build_FP_tree(dataset, min_support)
    if header_table is None:
        return None
    frequentItemList = []
    mine_FP_tree(root, header_table, min_support, set(), frequentItemList)
    return frequentItemList


# Note: this algorithm takes a dataset and a minimum support, and returns the list of frequent itemsets.
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值