AC自动机

AC自动机

AC自动机(Aho-Corasick Automaton)作为一种KMP的扩展算法,可以在多个模式串的条件下完成对文本串的匹配。

基本原理

AC自动机的底层数据结构是Trie字典树,其失配处理的思想来自KMP匹配算法,故阅读此文需要以Trie树和KMP算法作为前置知识。

AC自动机的核心本质还是自动机,主要是构建一个状态转移函数 $P[u][v]$,表示状态 $u$ 读取到字符 $v$ 之后所到达的下一个状态。

状态转移
例如,当我们发现G失配之后,我们会进行状态转移,放弃当前模式串匹配,转移到另外一个模式串中。

创建Trie树

AC自动机的状态转移函数是以Trie树为基础创建的,因此在创建AC自动机之前,我们先要创建一棵Trie树,做法和一般的Trie树创建完全一样。

/**
 * Trie that serves as the skeleton of the Aho-Corasick automaton.
 *
 * Node 0 is the root and a value of 0 in `tree` means "no child", so the
 * arrays must start out zero-filled. Objects with static storage duration
 * are zero-initialized before construction, which satisfies this.
 *
 * NOTE(review): `tree` holds kMaxNodes x kAlphabet ints, far too large for a
 * typical call stack; declare instances as globals or function-local statics.
 */
struct ACAutomaton
{
    static constexpr int kMaxNodes = 10000005; // capacity in trie nodes, root included
    static constexpr int kAlphabet = 26;       // letters 'a'..'z'

    int tree[kMaxNodes][kAlphabet]; // tree[u][c]: child of u along letter c, 0 if absent
    int tot = 0;                    // id of the most recently created node

    /**
     * Inserts one pattern into the trie, creating missing nodes on the way.
     *
     * @param s pattern to add; every character is used as `ch - 'a'` to index
     *          a row of 26 entries, so it must lie in 'a'..'z' (not checked).
     */
    void insert(const std::string &s)
    {
        int curr = 0;
        for (char ch : s)
        {
            int &next = tree[curr][ch - 'a'];
            if (next == 0)
            {
                next = ++tot;
            }
            curr = next;
        }
    }
};

计算状态转移函数和fail指针和匹配

这一步,我们需要计算出完整的状态转移函数 $tree$ 和失配指针数组 $fail$,其中:

  • $tree[u][v]$ 代表状态 $u$ 读取字符 $v$ 之后所转移到的状态。
  • $fail[u]$ 代表Trie树中与状态 $u$ 所表示字符串的最长真后缀相对应的节点,即在 $u$ 处失配时应当跳转到的状态。

那么我们就可以写出代码:

/**
 * Aho-Corasick automaton over the letters 'a'..'z'.
 *
 * Usage: insert() every pattern, call buildAC() once, then query() a text.
 * Node 0 is the root and 0 also means "no child" before buildAC(), so the
 * arrays must start zero-filled; the global instance `ac` declared with the
 * struct has static storage duration and is therefore zero-initialized.
 *
 * NOTE(review): `tree` holds kMaxNodes x kAlphabet ints; instances are only
 * practical with static storage duration.
 */
struct ACAutomaton
{
    static constexpr int kMaxNodes = 10000005; // capacity in trie nodes, root included
    static constexpr int kAlphabet = 26;       // letters 'a'..'z'

    int tree[kMaxNodes][kAlphabet]; // trie edges; after buildAC() the full transition table
    int fail[kMaxNodes];            // failure link of every node, filled by buildAC()
    int ed[kMaxNodes];              // number of patterns ending at a node; -1 once counted by query()
    int tot = 0;                    // id of the most recently created node

    /**
     * Adds one pattern to the trie and bumps the end counter of its last node.
     *
     * @param s pattern; every character is used as `ch - 'a'` to index a row
     *          of 26 entries, so it must lie in 'a'..'z' (not checked).
     */
    void insert(const std::string &s)
    {
        int curr = 0;
        for (char ch : s)
        {
            int &next = tree[curr][ch - 'a'];
            if (next == 0)
            {
                next = ++tot;
            }
            curr = next;
        }
        ed[curr]++;
    }

    /**
     * Computes the failure links and completes the transition table with a
     * breadth-first walk over the trie. Call after the last insert().
     */
    void buildAC()
    {
        std::queue<int> que;
        // Children of the root keep fail == 0 (the zero-initialized value).
        for (int c = 0; c < kAlphabet; c++)
            if (tree[0][c] != 0)
                que.push(tree[0][c]);
        while (!que.empty())
        {
            int curr = que.front();
            que.pop();
            for (int c = 0; c < kAlphabet; c++)
            {
                if (tree[curr][c] == 0)
                {
                    // Missing edge: borrow the transition of the failure node,
                    // which was finalized earlier because it was dequeued first.
                    tree[curr][c] = tree[fail[curr]][c];
                }
                else
                {
                    // Real child: its failure link is where the parent's
                    // failure node goes on the same letter.
                    fail[tree[curr][c]] = tree[fail[curr]][c];
                    que.push(tree[curr][c]);
                }
            }
        }
    }

    /**
     * Counts how many inserted patterns occur in `txt`; a pattern inserted k
     * times contributes k.
     *
     * Every visited node has its `ed` entry overwritten with -1, so each
     * pattern end is counted at most once and a later call only reports
     * patterns not seen before.
     *
     * @param txt text to scan; characters must lie in 'a'..'z' (not checked).
     * @return sum of `ed` over all nodes reached for the first time.
     */
    int query(const std::string &txt)
    {
        int ans = 0;
        int status = 0;
        for (char ch : txt)
        {
            status = tree[status][ch - 'a'];
            // Walk the failure chain; stop at the root or at a node that an
            // earlier step already consumed (its whole chain is consumed too).
            for (int j = status; j != 0 && ed[j] != -1; j = fail[j])
            {
                ans += ed[j];
                ed[j] = -1;
            }
        }
        return ans;
    }
} ac;

例题

P3808

#include <bits/stdc++.h>

using namespace std;

typedef long long ll;

#define FR freopen("in.txt", "r", stdin)
#define FW freopen("out1.txt", "w", stdout)

/**
 * Aho-Corasick automaton over the letters 'a'..'z', used here to count how
 * many of the inserted patterns appear in one text.
 *
 * Usage: insert() every pattern, call buildAC() once, then query() the text.
 * Node 0 is the root and 0 also means "no child" before buildAC(), so the
 * arrays must start zero-filled; the global instance `ac` has static storage
 * duration and is therefore zero-initialized.
 *
 * NOTE(review): `tree` holds kMaxNodes x kAlphabet ints; confirm that this
 * capacity is really needed for the intended input limits.
 */
struct ACAutomaton
{
    static constexpr int kMaxNodes = 10000005; // capacity in trie nodes, root included
    static constexpr int kAlphabet = 26;       // letters 'a'..'z'

    int tree[kMaxNodes][kAlphabet]; // trie edges; after buildAC() the full transition table
    int fail[kMaxNodes];            // failure link of every node, filled by buildAC()
    int ed[kMaxNodes];              // number of patterns ending at a node; -1 once counted by query()
    int tot = 0;                    // id of the most recently created node

    /**
     * Adds one pattern to the trie and bumps the end counter of its last node.
     *
     * @param s pattern; every character is used as `ch - 'a'` to index a row
     *          of 26 entries, so it must lie in 'a'..'z' (not checked).
     */
    void insert(const std::string &s)
    {
        int curr = 0;
        for (char ch : s)
        {
            int &next = tree[curr][ch - 'a'];
            if (next == 0)
            {
                next = ++tot;
            }
            curr = next;
        }
        ed[curr]++;
    }

    /**
     * Computes the failure links and completes the transition table with a
     * breadth-first walk over the trie. Call after the last insert().
     */
    void buildAC()
    {
        std::queue<int> que;
        // Children of the root keep fail == 0 (the zero-initialized value).
        for (int c = 0; c < kAlphabet; c++)
            if (tree[0][c] != 0)
                que.push(tree[0][c]);
        while (!que.empty())
        {
            int curr = que.front();
            que.pop();
            for (int c = 0; c < kAlphabet; c++)
            {
                if (tree[curr][c] == 0)
                {
                    // Missing edge: borrow the transition of the failure node,
                    // which was finalized earlier because it was dequeued first.
                    tree[curr][c] = tree[fail[curr]][c];
                }
                else
                {
                    // Real child: its failure link is where the parent's
                    // failure node goes on the same letter.
                    fail[tree[curr][c]] = tree[fail[curr]][c];
                    que.push(tree[curr][c]);
                }
            }
        }
    }

    /**
     * Counts how many inserted patterns occur in `txt`; a pattern inserted k
     * times contributes k.
     *
     * Every visited node has its `ed` entry overwritten with -1, so each
     * pattern end is counted at most once and a later call only reports
     * patterns not seen before.
     *
     * @param txt text to scan; characters must lie in 'a'..'z' (not checked).
     * @return sum of `ed` over all nodes reached for the first time.
     */
    int query(const std::string &txt)
    {
        int ans = 0;
        int status = 0;
        for (char ch : txt)
        {
            status = tree[status][ch - 'a'];
            // Walk the failure chain; stop at the root or at a node that an
            // earlier step already consumed (its whole chain is consumed too).
            for (int j = status; j != 0 && ed[j] != -1; j = fail[j])
            {
                ans += ed[j];
                ed[j] = -1;
            }
        }
        return ans;
    }
} ac;

// Driver: reads the pattern count and the patterns, builds the automaton,
// then prints the result of querying the text that follows.
int main()
{
    int n;
    cin >> n;

    for (int remaining = n; remaining > 0; --remaining)
    {
        string pattern;
        cin >> pattern;
        ac.insert(pattern);
    }

    // Failure links can only be computed once every pattern is in the trie.
    ac.buildAC();

    string txt;
    cin >> txt;
    const int matched = ac.query(txt);
    cout << matched;
    return 0;
}

NC20443

我们发现每次统计的时候 $fail$ 指针会跳转很多次,我们可以先把每个状态被经过的次数记录下来,然后按照 $fail$ 指针的拓扑序一次性向上累加统计。

#include <bits/stdc++.h>

using namespace std;

typedef long long ll;

#define FR freopen("in.txt", "r", stdin)
#define FW freopen("out1.txt", "w", stdout)

// Input words, 1-indexed: main() reads word i into str[i] with scanf("%s")
// and later scans the same buffers again as texts.
char str[210][1000005];

/**
 * Aho-Corasick automaton that counts, for every inserted word, how often it
 * occurs in all scanned texts.
 *
 * Usage: insert() each word with a 1-based id, build(), qeury() every text,
 * then hand() once; afterwards cnt[id] holds the total for the first word
 * with that content and mp[id] redirects duplicates to that first id.
 *
 * Node 0 is the root and 0 means "no child" / "no word", so all arrays must
 * start zero-filled; the global instance `ac` has static storage duration and
 * is therefore zero-initialized.
 */
struct AC
{
    int tree[3000000][26]; // trie edges; after build() the full transition table
    int fail[3000000];     // failure link of every node, filled by build()
    int ed[3000000];       // id of the first word ending at the node, 0 if none
    int mem[3000000];      // visits per node from qeury(); hand() adds the counts of nodes failing into it
    bool vis[3000000];     // nodes already placed on `sta` by topicsort()
    int cnt[205];          // cnt[id]: occurrences of word id, valid after hand()
    int mp[205];           // mp[id]: id of an earlier identical word, 0 if id is the first
    int tot = 0;           // id of the most recently created node
    stack<int> sta;        // nodes ordered so that each node is popped before its failure node
    std::vector<int> chain; // scratch buffer reused by topicsort()

    /**
     * Inserts word `s` under identifier `i`.
     *
     * @param s NUL-terminated word; characters are used as `c - 'a'` to index
     *          a row of 26 entries, so they must lie in 'a'..'z' (not checked).
     * @param i identifier of the word; 0 is reserved for "none". It indexes
     *          `mp` (and later `cnt`), which have 205 entries.
     */
    void insert(char s[], int i)
    {
        int curr = 0;
        // The position counter has its own name so that it no longer shadows
        // the word id `i`.
        for (int pos = 0; s[pos] != '\0'; pos++)
        {
            int &next = tree[curr][s[pos] - 'a'];
            if (next == 0)
            {
                next = ++tot;
            }
            curr = next;
        }
        if (ed[curr] != 0)
        {
            // Same content as an earlier word: remember which id owns the count.
            mp[i] = ed[curr];
        }
        else
            ed[curr] = i;
    }

    /**
     * Computes the failure links and completes the transition table with a
     * breadth-first walk over the trie. Call after the last insert().
     */
    void build()
    {
        queue<int> que;
        // Children of the root keep fail == 0 (the zero-initialized value).
        for (int c = 0; c < 26; c++)
        {
            if (tree[0][c] != 0)
                que.push(tree[0][c]);
        }

        while (!que.empty())
        {
            int curr = que.front();
            que.pop();

            for (int c = 0; c < 26; c++)
            {
                if (tree[curr][c] == 0)
                {
                    // Missing edge: reuse the failure node's transition.
                    tree[curr][c] = tree[fail[curr]][c];
                }
                else
                {
                    // Real child: derive its failure link from the parent's.
                    fail[tree[curr][c]] = tree[fail[curr]][c];
                    que.push(tree[curr][c]);
                }
            }
        }
    }

    /**
     * Runs text `s` through the automaton and counts one visit on every state
     * reached. Matches along failure links are resolved later by hand().
     *
     * @param s NUL-terminated text; characters must lie in 'a'..'z' (not checked).
     */
    void qeury(char s[])
    {
        int status = 0;
        for (int pos = 0; s[pos] != '\0'; pos++)
        {
            status = tree[status][s[pos] - 'a'];
            mem[status]++;
        }
    }

    /**
     * Pushes node `i` and every not-yet-visited node on its failure chain
     * onto `sta`, deepest failure target first, so that `i` ends up on top.
     *
     * Implemented iteratively: a recursive walk needs one call frame per
     * failure-link hop and can exhaust the call stack on long chains.
     *
     * @param i node to start from.
     */
    void topicsort(int i)
    {
        chain.clear();
        int node = i;
        for (;;)
        {
            vis[node] = true;
            chain.push_back(node);
            if (vis[fail[node]])
                break;
            node = fail[node];
        }

        // Push in reverse so a node always sits above its failure node.
        for (std::vector<int>::reverse_iterator it = chain.rbegin(); it != chain.rend(); ++it)
            sta.push(*it);
    }

    /**
     * Folds the visit counters along the failure links and fills `cnt`.
     * Call once, after all texts were passed to qeury().
     */
    void hand()
    {
        for (int node = 0; node <= tot; node++)
        {
            if (!vis[node])
                topicsort(node);
        }

        // Nodes are popped before their failure node, so mem[curr] is final
        // by the time it is forwarded.
        while (!sta.empty())
        {
            int curr = sta.top();
            sta.pop();
            if (ed[curr] != 0)
                cnt[ed[curr]] += mem[curr];
            if (fail[curr] != 0)
            {
                mem[fail[curr]] += mem[curr];
            }
        }
    }
} ac;

// Driver: reads n words, inserts them all, then scans every word as a text
// and prints one occurrence count per word.
int main()
{
    int n;
    scanf("%d", &n);

    for (int id = 1; id <= n; ++id)
    {
        scanf("%s", str[id]);
        ac.insert(str[id], id);
    }
    ac.build();

    // Every word is also a text to search in.
    for (int id = 1; id <= n; ++id)
        ac.qeury(str[id]);

    ac.hand();

    for (int id = 1; id <= n; ++id)
    {
        // Duplicate words share the counter of their first occurrence.
        const int owner = (ac.mp[id] != 0) ? ac.mp[id] : id;
        printf("%d\n", ac.cnt[owner]);
    }
    return 0;
}

P3121 粘滞字符串删除

#include <bits/stdc++.h>
#define FR freopen("in.txt", "r", stdin)
using namespace std;
typedef unsigned long long ll;

// Maps a character to its offset from 'a'. The argument is not parenthesized
// in the expansion, so pass only a simple expression.
#define ID(x) (x - 'a')

// Trie edges: insert() stores child node ids (0 = no child, node 0 = root);
// buildAC() replaces the remaining zeros with the failure node's transition.
int tree[200005][26];
// Failure link of every node, written by buildAC().
int fail[200005];
// dep[node]: number of characters on the path from the root, set by insert()
// when the node is created.
int dep[200005];
// Id of the most recently created trie node.
int tot = 0;
// ed[node] is true when an inserted word ends at that node.
bool ed[200005];
// Used by main() as a stack of text positions that are still kept.
int sta[200005];
// bak[i]: automaton state recorded by main() right after consuming txt[i].
int bak[200005];
// Number of entries currently on `sta`.
int top = 0;

void insert(string &str)
{
    int curr = 0;
    for (int i = 0; i < str.size(); i++)
    {
        if (tree[curr][ID(str[i])] == 0)
        {
            tree[curr][ID(str[i])] = ++tot;
            dep[tot] = i + 1;
        }

        curr = tree[curr][ID(str[i])];
    }
    ed[curr] = true;
}

// Breadth-first pass over the global trie that fills `fail` and turns every
// missing edge into a direct transition. Call after the last insert().
void buildAC()
{
    queue<int> pending;

    // Depth-1 nodes keep fail == 0, so they are only enqueued.
    for (int c = 0; c < 26; ++c)
    {
        const int child = tree[0][c];
        if (child != 0)
            pending.push(child);
    }

    while (!pending.empty())
    {
        const int node = pending.front();
        pending.pop();
        const int fallback = fail[node];

        for (int c = 0; c < 26; ++c)
        {
            int &edge = tree[node][c];
            if (edge != 0)
            {
                // Existing child: its failure link follows the parent's
                // failure node along the same letter.
                fail[edge] = tree[fallback][c];
                pending.push(edge);
            }
            else
            {
                // No child: borrow the failure node's transition.
                edge = tree[fallback][c];
            }
        }
    }
}

// Driver: reads the text, then n words; repeatedly removes an inserted word
// as soon as it appears at the end of the kept text, and prints what remains.
int main()
{
    string txt;
    cin >> txt;
    int n;
    cin >> n;

    for (int i = 0; i < n; i++)
    {
        string word;
        cin >> word;
        insert(word);
    }

    buildAC();
    int curr = 0;
    for (int i = 0; i < static_cast<int>(txt.size()); i++)
    {
        curr = tree[curr][ID(txt[i])];
        sta[top++] = i; // keep this character for now
        bak[i] = curr;  // state to resume from if later characters get removed
        if (ed[curr])
        {
            // A word ends here: drop its dep[curr] characters from the stack
            // and continue from the state stored for the new top character.
            top -= dep[curr];
            // With nothing left on the stack the automaton restarts at the
            // root; indexing sta[top - 1] there would read out of bounds.
            curr = (top > 0) ? bak[sta[top - 1]] : 0;
        }
    }

    for (int i = 0; i < top; i++)
    {
        cout << txt[sta[i]];
    }
    return 0;
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值