AC自动机-Python实现


参考链接: https://www.cnblogs.com/nullzx/p/7499397.html

1、Trie前缀树

用途:字符串排序统计,效率优于哈希算法

本质:N叉树

class TrieNode():
    def __init__(self):
        self.data = {}
        self.is_word = False

class Trie:
    def __init__(self):
        """
        Initialize your data structure here.
        """
        self.root = TrieNode()

    def insert(self, word):
        """
        Inserts a word into the trie.
        :type word: str
        :rtype: void
        """
        node = self.root
        for chars in word:
            child = node.data.get(chars)
            if not child:
                node.data[chars] = TrieNode()
            node = node.data[chars]
        node.is_word = True

    def search(self, word):
        """
        Returns if the word is in the trie.
        :type word: str
        :rtype: bool
        """
        node = self.root
        for chars in word:
            node = node.data.get(chars)
            if not node:
                return False
        return node.is_word  # 判断单词是否是完整的存在在trie树中

    def startsWith(self, prefix):
        """
        Returns if there is any word in the trie that starts with the given prefix.
        :type prefix: str
        :rtype: bool
        """
        node = self.root
        for chars in prefix:
            node = node.data.get(chars)
            if not node:
                return False
        return True

    def get_start(self, prefix):
        """
          Returns words started with prefix
          :param prefix:
          :return: words (list)
        """

        def get_key(pre, pre_node):
            word_list = []
            if pre_node.is_word:
                word_list.append(pre)
            for x in pre_node.data.keys():
                word_list.extend(get_key(pre + str(x), pre_node.data.get(x)))
            return word_list

        words = []
        if not self.startsWith(prefix):
            return words
        if self.search(prefix):
            words.append(prefix)
            return words
        node = self.root
        for chars in prefix:
            node = node.data.get(chars)
        return get_key(prefix, node)


trie = Trie()
trie.insert("something")
trie.insert("somebody")
trie.insert("somebody1")
trie.insert("somebody3")
print(trie.search("key"))
print(trie.search("somebody3"))
print(trie.get_start('somebody'))

2. AC自动机

每个节点带有fail指针,同KMP算法一样,指向与自己具有最大公共前缀串的位置。通过动态规划思想,若孩子节点在父节点fail指向的孩子节点里,子节点fail指针为父节点fail指针的子节点,否则指向fail节点的fail节点,直到无法fail指向根节点。

class TrieNode():
    def __init__(self):
        self.child = {}
        self.failto = None
        self.is_word = False
        '''
        下面节点值可以根据具体场景进行赋值
        '''
        self.str_ = ''
        self.num = 0

class AhoCorasickAutomation:
    def __init__(self):
        """
        Initialize your data structure here.
        """
        self.root = TrieNode()

    def buildTrieTree(self, wordlst):
        for word in wordlst:
            cur = self.root
            for i, c in enumerate(word):
                if c not in cur.child:
                    cur.child[c] = TrieNode()
                ps = cur.str_
                cur = cur.child[c]
                cur.str_ = ps + c
            cur.is_word = True
            cur.num += 1

    def build_AC_from_Trie(self):
        queue = []
        for child in self.root.child:
            self.root.child[child].failto = self.root
            queue.append(self.root.child[child])

        while len(queue) > 0:
            cur = queue.pop(0)
            for child in cur.child.keys():
                failto = cur.failto
                while True:
                    if failto == None:
                        cur.child[child].failto = self.root
                        break
                    if child in failto.child:
                        cur.child[child].failto = failto.child[child]
                        break
                    else:
                        failto = failto.failto
                queue.append(cur.child[child])

    def ac_search(self, str_):
        cur = self.root
        result = {}
        i = 0
        n = len(str_)
        while i < n:
            c = str_[i]
            if c in cur.child:
                cur = cur.child[c]
                if cur.is_word:
                    temp = cur.str_
                    result.setdefault(temp, [])
                    result[temp].append([i - len(temp) + 1, i, cur.num])

                '''
                处理所有其他长度公共字串
                '''
                fl = cur.failto
                while fl:
                    if fl.is_word:
                        temp = fl.str_
                        result.setdefault(temp, [])
                        result[temp].append([i - len(temp) + 1, i, cur.failto.num])
                    fl = fl.failto
                i += 1

            else:
                cur = cur.failto
                if cur == None:
                    cur = self.root
                    i += 1


        return result


acTree = AhoCorasickAutomation()

acTree.buildTrieTree(['abcdef', 'abhab', 'bcd', 'cde', 'cd', 'cdfkcdf', 'cde'])
acTree.build_AC_from_Trie()
res = acTree.ac_search('bcabcdebcedfabcdefababkabhabkcdbcde')
print(res)

import ahocorasick

ac = ahocorasick.Automaton()
wordlst = ['abcdef', 'abhab', 'bcd', 'cd', 'cde', 'cdfkcdf', 'cde']
for word in wordlst:
    ac.add_word(word, [word])

ac.make_automaton()

for item in ac.iter('bcabcdebcedfabcdefababkabhabkcdbcde'):
    print(item)

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值