AC自动机的另类匹配算法

最新推荐文章于 2023-06-29 20:23:34 发布

weixin_30830327

最新推荐文章于 2023-06-29 20:23:34 发布

阅读量126

点赞数

原文链接：http://www.cnblogs.com/ljt12138/p/6684383.html

版权

思路

蒟蒻觉得ac自动机的匹配过程太繁琐，于是决定手制一个好理解的匹配算法。经过一天的艰苦奋斗，终于用我的递归算法用900s的擦边成绩A掉了hdu2222的模板题。(O-O)好开心。。
简要说一下思路，就是将字符串直接在树上匹配，如果无法匹配或字符串已经结束，则用fail指针跳转。不难发现，如果对于所有模式串都互不为子串，那么这样的算法已经可以交代了。证明过程大致如下：

对于当前串P匹配上了字符串S[0]，证明P->fail不需要匹配S。

证明：设Q=PS[0],S′=S[1],T=Pfail，如果T没有孩子S’，显然不存在匹配；如果T有孩子S’，则一定可以归纳为对于Q匹配S’的情况，最终要么如上情况，要么匹配失败而跳转，从而匹配到。

然而还有一个特殊情况，就是两个子串存在前缀包含关系(如abc和ab)，需要一个特判，即当节点为finish但后面还有节点时，在这次把finish和通过fail与这个finish链接的所有finish清算一下。

5.1 补充：
由于递归几乎是线性的，简单的改为非递归，差不多600sAC，见下面代码的match2，完全由递归版改来。

匹配 {
    如果字符串终止，找到所有fail可达且已经finish的节点并计算返回
    count = 0
    如果当前节点finish，count加上finish
    如果存在孩子str[0]，沿着树向下匹配，并加上count和所有fail可达且finish的节点并返回
    如果节点为root，匹配str+1
    // 匹配形如 she --> sheee，否则会死循环
    否则，通过fail跳转计算
}

    int match(Node _nd, const char* _str)
    {
        if (*_str == '\0') {
            if (_nd->finish) {
                int t = _nd->finish; _nd->finish = 0;
                return t+match(_nd->fail, _str);
            }
            if (_nd == root)
                return 0;
            return match(_nd->fail, _str);
        }
        int count = 0;
        if (_nd->finish)
            count = _nd->finish;
        _nd->finish = 0;
        if (_nd->children[*_str-'a'])
            return match(_nd->children[*_str-'a'], _str+1)+count+match(_nd->fail, emp);
        if (_nd == root)
            return match(_nd, _str+1);
        return match(_nd->fail, _str)+count;
    }

代码

多说无益，上我的擦边AC代码：
重点

有重复key，算多个
同一个文章里只能匹配一次某个key

#include <iostream>
#include <cstdio>
#include <queue>
#include <stack>
using namespace std;

struct AcTrieNode {
    typedef AcTrieNode *Node;
    int finish = 0;
    Node children[26];
    Node fail;
    //Node pre;
    //char _this;
    AcTrieNode():finish(0),fail(0)
    {
        for (int i=0; i<26; i++)
            children[i] = 0;
    }
};

class AcAutoMachine {
private:
    typedef AcTrieNode *Node;
    Node root;
    char emp[1];
    void push(Node &nd, const char * _str)
    {
        if (!nd)
            nd = new AcTrieNode;
        if (*_str=='\0') {
            nd->finish++;
            return;
        }
        push(nd->children[(*_str)-'a'], _str+1);
    }
    void dfs(Node tree)
    {
        if (tree) {
            for (int i=0; i<26; i++)
                if (tree->children[i]) {
                    cout << (char)(i+'a');
                dfs(tree->children[i]);
            }

        }
    }
    int match(Node _nd, const char* _str)
    {
        if (*_str == '\0') {
            if (_nd->finish) {
                int t = _nd->finish; _nd->finish = 0;
                return t+match(_nd->fail, _str);
            }
            if (_nd == root)
                return 0;
            return match(_nd->fail, _str);
        }
        int count = 0;
        if (_nd->finish)
            count = _nd->finish;
        _nd->finish = 0;
        if (_nd->children[*_str-'a'])
            return match(_nd->children[*_str-'a'], _str+1)+count+match(_nd->fail, emp);
        // 不可能再算到的
        if (_nd == root)
            return match(_nd, _str+1);
        return match(_nd->fail, _str)+count;
    }

    int match2(const char* _str)
    {
        int ans = 0;
        Node _nd = root;
        while (1) {
            //cout << _str << endl;
            if (*_str == '\0') {
                if (_nd->finish) {*/
                    ans += _nd->finish;
                    _nd->finish = 0;
                    // 2
                }
                if (_nd == root)
                    break;
                _nd = _nd->fail;
                continue;
            }
            int count = 0;
            if (_nd->finish)
                count = _nd->finish;
            _nd->finish = 0;
            // 2
            if (_nd->children[*_str-'a']){
                ans += count;
                Node nd = _nd->fail;
                while (1) {
                    if (nd->finish) {
                        ans += nd->finish;
                        nd->finish = 0;
                    }
                    if (nd == root)
                        break;
                    nd = nd->fail;
                }
                _nd = _nd->children[*_str-'a'];
                ++_str;
                continue;
            }

            // 不可能再算到的
            if (_nd == root) {
                _str++;
                continue;
            }
            ans += count;
            _nd = _nd->fail;
        }
        return ans;
    }

public:
    AcAutoMachine()
    {
        root = new AcTrieNode;
        emp[0] = '\0';
    }
    void push(const char * _str)
    {
        push(root, _str);
    }
    void init()
    {
        queue<Node> q;
        q.push(root);
        while (!q.empty()) {
            Node temp = q.front(), prt;
            q.pop();
            for (int i=0; i<26; i++) {
                if (temp->children[i]) {
                    q.push(temp->children[i]);
                    prt = temp->fail;
                    while (prt){
                        if (prt->children[i]) {
                            prt = prt->children[i];
                            break;
                        }
                        prt = prt->fail;
                    }
                    if (!prt) prt = root;
                    temp->children[i]->fail = prt;
                }
            }
        }
        root->fail = root;
    }
    void dfs()
    {
        dfs(root);
    }
    int match(const char* _str)
    {
        //return match(root, _str);
        return match2(_str);
    }
};

char a[55], b[1000005];
int main()
{
    //freopen("data.txt", "r", stdin);
    int cas;
    cin >> cas;
    while (cas--) {
    AcAutoMachine ac;
    int n;
    scanf("%d\n",&n);
    while (n--) {
        gets(a);
        ac.push(a);
    }
    ac.init();
    gets(b);
    //ac.dfs();
    int ans = 0;
    ans = ac.match(b);
    cout << ans << endl;
    }
    return 0;
}

闲话

AC automachine is not ACcepted automachine (O-O)。代码改了一天，递归版程序由40行降到了20行，思路终于明晰了。。

本来还想分析一下效率，试着用聚合分析，结果分析不通。。看来聚合分析这种东西看着简单做着难，毕竟还是蒟蒻。不过既然（循环版）能600msAC，看来复杂度应该没有问题，只是常数稍大。

the algorithm is linear in the length of the strings plus the length of the searched text plus the number of output matches
wiki，分析过程未知。

转载于:https://www.cnblogs.com/ljt12138/p/6684383.html

weixin_30830327

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
AC自动机的另类匹配算法

思路蒟蒻觉得ac自动机的匹配过程太繁琐，于是决定手制一个好理解的匹配算法。经过一天的艰苦奋斗，终于用我的递归算法用900s的擦边成绩A掉了hdu2222的模板题。(O-O)好开心。。简要说一下思路，就是将字符串直接在树上匹配，如果无法匹配或字符串已经结束，则用fail指针跳转。不难发现，如果对于所有模式串都互不为子串，那么这样的算法已经可以交代了。证明过程大致如下：...
复制链接

扫一扫