AC自动机
AC自动机(Aho-Corasick Automaton)作为一种KMP的扩展算法,可以在多个模式串的条件下完成对文本串的匹配。
基本原理
AC自动机的底层数据结构是Trie字典树,底层算法为KMP匹配算法,故阅读此文需要先掌握Trie树和KMP算法作为前置知识。
AC自动机的核心本质还是自动机,主要构建一个状态转移函数 $P[u][v]$,表示状态 $u$ 读取到字符 $v$ 后转移到的下一个状态。
例如,当我们发现G失配之后,我们会进行状态转移,放弃当前模式串匹配,转移到另外一个模式串中。
创建Trie树
AC自动机的状态转移函数是以Trie树为基础创建的,因此在创建AC自动机之前,我们先要创建一棵Trie树,做法和一般的Trie树创建完全一样。
// Trie (prefix tree) over the lowercase letters 'a'..'z'; this is the structure the
// AC automaton is later built on. Node 0 is the root and nodes are numbered 1..tot in
// the order they are created.
struct ACAutomaton
{
    // tree[u][c]: child of node u along letter index c (letter 'a' + c), or 0 if there is none.
    // Rows are appended as nodes are created, so memory grows with the number of nodes
    // instead of being reserved up front.
    std::vector<std::array<int, 26>> tree = std::vector<std::array<int, 26>>(1);
    // Number of non-root nodes created so far.
    int tot = 0;

    // Inserts the word s into the trie, creating the missing nodes along its path.
    // Precondition: every character of s is in 'a'..'z' (other characters index out of range).
    // An empty string creates no node.
    void insert(const std::string &s)
    {
        int curr = 0;
        for (const char ch : s)
        {
            const int c = ch - 'a';
            int next = tree[curr][c];
            if (next == 0)
            {
                next = ++tot;
                // Append the zero-filled row first: growing the vector may reallocate,
                // so the parent's slot is written only afterwards, through a fresh index.
                tree.emplace_back();
                tree[curr][c] = next;
            }
            curr = next;
        }
    }
};
计算状态转移函数和fail指针和匹配
这一步,我们需要计算出完整的状态转移函数 $tree$ 和失配指针数组 $fail$,其中:
- $tree[u][v]$ 代表状态 $u$ 读取下一个字符 $v$ 之后所转移到的状态。
- $fail[u]$ 代表状态 $u$ 所表示字符串的最长真后缀(该后缀同时是Trie树中某个节点所表示的前缀)所对应的节点。
那么我们就可以写出代码:
// Aho-Corasick automaton over the lowercase letters 'a'..'z'.
// Usage: insert() every pattern, call buildAC() once, then query() a text.
// Node 0 is the root; nodes are numbered 1..tot in creation order. All per-node tables
// grow together with the trie, so memory is proportional to the number of nodes.
struct ACAutomaton
{
    // tree[u][c]: before buildAC() the trie child of u on letter index c (0 = none);
    // after buildAC() the full transition function (missing edges follow the fail link).
    std::vector<std::array<int, 26>> tree = std::vector<std::array<int, 26>>(1);
    // fail[u]: failure link of node u, filled in by buildAC().
    std::vector<int> fail = std::vector<int>(1, 0);
    // ed[u]: number of inserted patterns ending at u; query() overwrites it with -1
    // once the node has been counted.
    std::vector<int> ed = std::vector<int>(1, 0);
    // Number of non-root nodes created so far.
    int tot = 0;

    // Adds one pattern. Inserting the same pattern twice counts it twice.
    // Precondition: s consists of 'a'..'z' only, and buildAC() has not been called yet
    // (afterwards tree no longer distinguishes real children from fail transitions).
    void insert(const std::string &s)
    {
        int curr = 0;
        for (const char ch : s)
        {
            const int c = ch - 'a';
            int next = tree[curr][c];
            if (next == 0)
            {
                next = ++tot;
                // Grow every per-node table before writing the parent's slot:
                // appending may reallocate the rows.
                tree.emplace_back();
                fail.push_back(0);
                ed.push_back(0);
                tree[curr][c] = next;
            }
            curr = next;
        }
        ed[curr]++;
    }

    // Breadth-first pass that computes fail[] and completes tree[][] into a total
    // transition function. Nodes are processed in order of depth, so fail[curr] and its
    // row are already final when curr is taken from the queue.
    void buildAC()
    {
        std::queue<int> que;
        for (int c = 0; c < 26; c++)
            if (tree[0][c] != 0)
                que.push(tree[0][c]); // depth-1 nodes keep fail == 0 (the root)
        while (!que.empty())
        {
            const int curr = que.front();
            que.pop();
            for (int c = 0; c < 26; c++)
            {
                const int viaFail = tree[fail[curr]][c];
                int &next = tree[curr][c];
                if (next == 0)
                {
                    // Missing edge: reuse the transition of the fail state.
                    next = viaFail;
                }
                else
                {
                    // Real child: its failure link is the fail state's transition on c.
                    fail[next] = viaFail;
                    que.push(next);
                }
            }
        }
    }

    // Returns how many inserted patterns occur in txt, each inserted pattern counted once.
    // The call consumes the counters: every node reached is marked with ed == -1, so a
    // later query() does not count those patterns again.
    // Precondition: buildAC() has been called and txt consists of 'a'..'z' only.
    int query(const std::string &txt)
    {
        int ans = 0;
        int status = 0;
        for (const char ch : txt)
        {
            status = tree[status][ch - 'a'];
            // Walk the fail chain; stop at the root or at a node that was already counted
            // (everything above such a node was counted together with it).
            for (int j = status; j != 0 && ed[j] != -1; j = fail[j])
            {
                ans += ed[j];
                ed[j] = -1;
            }
        }
        return ans;
    }
} ac;
例题
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
#define FR freopen("in.txt", "r", stdin)
#define FW freopen("out1.txt", "w", stdout)
// Aho-Corasick automaton over the lowercase letters 'a'..'z'.
// Usage: insert() every pattern, call buildAC() once, then query() a text.
// Node 0 is the root; nodes are numbered 1..tot in creation order. All per-node tables
// grow together with the trie, so memory is proportional to the number of nodes.
struct ACAutomaton
{
    // tree[u][c]: before buildAC() the trie child of u on letter index c (0 = none);
    // after buildAC() the full transition function (missing edges follow the fail link).
    std::vector<std::array<int, 26>> tree = std::vector<std::array<int, 26>>(1);
    // fail[u]: failure link of node u, filled in by buildAC().
    std::vector<int> fail = std::vector<int>(1, 0);
    // ed[u]: number of inserted patterns ending at u; query() overwrites it with -1
    // once the node has been counted.
    std::vector<int> ed = std::vector<int>(1, 0);
    // Number of non-root nodes created so far.
    int tot = 0;

    // Adds one pattern. Inserting the same pattern twice counts it twice.
    // Precondition: s consists of 'a'..'z' only, and buildAC() has not been called yet
    // (afterwards tree no longer distinguishes real children from fail transitions).
    void insert(const std::string &s)
    {
        int curr = 0;
        for (const char ch : s)
        {
            const int c = ch - 'a';
            int next = tree[curr][c];
            if (next == 0)
            {
                next = ++tot;
                // Grow every per-node table before writing the parent's slot:
                // appending may reallocate the rows.
                tree.emplace_back();
                fail.push_back(0);
                ed.push_back(0);
                tree[curr][c] = next;
            }
            curr = next;
        }
        ed[curr]++;
    }

    // Breadth-first pass that computes fail[] and completes tree[][] into a total
    // transition function. Nodes are processed in order of depth, so fail[curr] and its
    // row are already final when curr is taken from the queue.
    void buildAC()
    {
        std::queue<int> que;
        for (int c = 0; c < 26; c++)
            if (tree[0][c] != 0)
                que.push(tree[0][c]); // depth-1 nodes keep fail == 0 (the root)
        while (!que.empty())
        {
            const int curr = que.front();
            que.pop();
            for (int c = 0; c < 26; c++)
            {
                const int viaFail = tree[fail[curr]][c];
                int &next = tree[curr][c];
                if (next == 0)
                {
                    // Missing edge: reuse the transition of the fail state.
                    next = viaFail;
                }
                else
                {
                    // Real child: its failure link is the fail state's transition on c.
                    fail[next] = viaFail;
                    que.push(next);
                }
            }
        }
    }

    // Returns how many inserted patterns occur in txt, each inserted pattern counted once.
    // The call consumes the counters: every node reached is marked with ed == -1, so a
    // later query() does not count those patterns again.
    // Precondition: buildAC() has been called and txt consists of 'a'..'z' only.
    int query(const std::string &txt)
    {
        int ans = 0;
        int status = 0;
        for (const char ch : txt)
        {
            status = tree[status][ch - 'a'];
            // Walk the fail chain; stop at the root or at a node that was already counted
            // (everything above such a node was counted together with it).
            for (int j = status; j != 0 && ed[j] != -1; j = fail[j])
            {
                ans += ed[j];
                ed[j] = -1;
            }
        }
        return ans;
    }
} ac;
// Driver: reads the number of patterns and the patterns themselves, builds the
// automaton, then reads one text and prints the value returned by ac.query for it.
int main()
{
    int patternCount;
    std::cin >> patternCount;
    for (int remaining = patternCount; remaining > 0; --remaining)
    {
        std::string pattern;
        std::cin >> pattern;
        ac.insert(pattern);
    }

    ac.buildAC();

    std::string text;
    std::cin >> text;
    const int matched = ac.query(text);
    std::cout << matched;
    return 0;
}
我们发现每次统计的时候 $fail$ 指针会跳转很多次,我们可以先把每个状态被访问的次数记录下来,然后按拓扑序沿 $fail$ 指针一次性累加统计。
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
#define FR freopen("in.txt", "r", stdin)
#define FW freopen("out1.txt", "w", stdout)
// Input words: main reads the i-th word into str[i] (rows 1..n). Each row has room for
// 1000004 characters plus the terminating NUL.
char str[210][1000005];
// Aho-Corasick automaton that counts, for every inserted word, how often it occurs in
// the texts fed to qeury(). Instead of walking the fail chain on every text character,
// qeury() only records how often each state is reached and hand() pushes those counts
// up the fail links once.
// Usage: insert() all words with ids >= 1, build(), qeury() each text, then hand() once.
// Node 0 is the root; all per-node tables grow together with the trie.
struct AC
{
    // tree[u][c]: trie child of u on letter index c (0 = none) before build();
    // the full transition function after build().
    std::vector<std::array<int, 26>> tree = std::vector<std::array<int, 26>>(1);
    // fail[u]: failure link of node u, filled in by build().
    std::vector<int> fail = std::vector<int>(1, 0);
    // ed[u]: id of the first word inserted that ends at u, or 0 if no word ends there.
    std::vector<int> ed = std::vector<int>(1, 0);
    // mem[u]: number of times state u was reached in qeury(); hand() adds to it the
    // counts of all states whose fail chain passes through u.
    std::vector<int> mem = std::vector<int>(1, 0);
    // vis[u]: non-zero once topicsort() has visited u.
    std::vector<char> vis = std::vector<char>(1, 0);
    // cnt[id]: occurrence total of the word with that id (valid after hand()).
    // Starts with 205 entries and grows when a larger id is inserted.
    std::vector<int> cnt = std::vector<int>(205, 0);
    // mp[id]: for a repeated word, the id of its first occurrence; 0 otherwise.
    std::vector<int> mp = std::vector<int>(205, 0);
    // Number of non-root nodes created so far.
    int tot = 0;
    // Nodes in the order produced by topicsort(); popped by hand().
    std::stack<int> sta;

    // Inserts the NUL-terminated word s under id i.
    // Precondition: i >= 1 (0 means "no word" in ed), s consists of 'a'..'z' only, and
    // build() has not been called yet.
    void insert(const char s[], int i)
    {
        int curr = 0;
        for (int pos = 0; s[pos] != '\0'; pos++)
        {
            const int c = s[pos] - 'a';
            int next = tree[curr][c];
            if (next == 0)
            {
                next = ++tot;
                // Grow every per-node table before writing the parent's slot:
                // appending may reallocate the rows.
                tree.emplace_back();
                fail.push_back(0);
                ed.push_back(0);
                mem.push_back(0);
                vis.push_back(0);
                tree[curr][c] = next;
            }
            curr = next;
        }
        if (i >= static_cast<int>(cnt.size()))
        {
            cnt.resize(i + 1, 0);
            mp.resize(i + 1, 0);
        }
        if (ed[curr] != 0)
        {
            // Same word seen before: remember which id carries its count.
            mp[i] = ed[curr];
        }
        else
            ed[curr] = i;
    }

    // Breadth-first pass that computes fail[] and completes tree[][] into a total
    // transition function.
    void build()
    {
        std::queue<int> que;
        for (int c = 0; c < 26; c++)
        {
            if (tree[0][c] != 0)
                que.push(tree[0][c]);
        }
        while (!que.empty())
        {
            const int curr = que.front();
            que.pop();
            for (int c = 0; c < 26; c++)
            {
                const int viaFail = tree[fail[curr]][c];
                int &next = tree[curr][c];
                if (next == 0)
                {
                    next = viaFail;
                }
                else
                {
                    fail[next] = viaFail;
                    que.push(next);
                }
            }
        }
    }

    // Runs the NUL-terminated text s through the automaton and counts in mem how often
    // each state is reached. Precondition: build() was called; s is 'a'..'z' only.
    void qeury(const char s[])
    {
        int status = 0;
        for (int pos = 0; s[pos] != '\0'; pos++)
        {
            status = tree[status][s[pos] - 'a'];
            mem[status]++;
        }
    }

    // Depth-first visit along the fail link: fail[i] is pushed on sta before i, so
    // popping sta yields every node before the node its fail link points to.
    void topicsort(int i)
    {
        vis[i] = true;
        if (!vis[fail[i]])
            topicsort(fail[i]);
        sta.push(i);
    }

    // Propagates the recorded visit counts along the fail links and accumulates the
    // total of every word into cnt. Intended to be called once, after all qeury() calls.
    void hand()
    {
        for (int i = 0; i <= tot; i++)
        {
            if (!vis[i])
                topicsort(i);
        }
        while (!sta.empty())
        {
            const int curr = sta.top();
            sta.pop();
            if (ed[curr] != 0)
                cnt[ed[curr]] += mem[curr];
            // Reaching curr also means reaching the suffix state fail[curr]; the root
            // carries no word, so nothing is forwarded to it.
            if (fail[curr] != 0)
            {
                mem[fail[curr]] += mem[curr];
            }
        }
    }
} ac;
// Driver: reads n words, builds the automaton over them, feeds every word back through
// it as text, and prints for each word (in input order) the count accumulated for it.
// Returns 1 on malformed input or when n exceeds the supported number of words.
int main()
{
    // Keeps word ids below 205 and rows inside str (210 rows), the fixed capacities
    // this program was written for.
    const int kMaxWords = 204;
    int n;
    if (scanf("%d", &n) != 1 || n > kMaxWords)
        return 1;
    for (int i = 1; i <= n; i++)
    {
        // The field width keeps the read inside one row of str: 1000004 characters + NUL.
        if (scanf("%1000004s", str[i]) != 1)
            return 1;
        ac.insert(str[i], i);
    }
    ac.build();
    for (int i = 1; i <= n; i++)
    {
        ac.qeury(str[i]);
    }
    ac.hand();
    for (int i = 1; i <= n; i++)
    {
        // A repeated word stores the id of its first occurrence in mp; its count lives there.
        const int id = (ac.mp[i] != 0) ? ac.mp[i] : i;
        printf("%d\n", ac.cnt[id]);
    }
    return 0;
}
#include <bits/stdc++.h>
#define FR freopen("in.txt", "r", stdin)
using namespace std;
typedef unsigned long long ll;
// Maps a lowercase letter to its 0-based alphabet index ('a' -> 0, ..., 'z' -> 25).
// The argument is parenthesized so that compound expressions (e.g. a conditional
// expression) are evaluated as a whole before 'a' is subtracted.
#define ID(x) ((x) - 'a')
// tree[u][c]: trie child of node u on letter index c (0 = none); buildAC() replaces the
// missing entries with the transition reached through the fail link.
int tree[200005][26];
// fail[u]: failure link of node u, filled in by buildAC().
int fail[200005];
// dep[u]: depth of node u, i.e. the length of the prefix it represents; set by insert()
// when the node is created.
int dep[200005];
// Number of non-root trie nodes created so far; node 0 is the root.
int tot = 0;
// ed[u]: true when an inserted word ends at node u.
bool ed[200005];
// sta[0..top): text positions of the characters currently kept, used as a stack by main.
int sta[200005];
// bak[i]: automaton state right after main consumed txt[i].
int bak[200005];
// Number of entries currently on sta.
int top = 0;
void insert(string &str)
{
int curr = 0;
for (int i = 0; i < str.size(); i++)
{
if (tree[curr][ID(str[i])] == 0)
{
tree[curr][ID(str[i])] = ++tot;
dep[tot] = i + 1;
}
curr = tree[curr][ID(str[i])];
}
ed[curr] = true;
}
// Turns the global trie into an automaton: a breadth-first sweep assigns every node its
// failure link and fills each missing edge with the target reached through that link.
void buildAC()
{
    std::queue<int> pending;

    // Children of the root start the sweep; their failure link stays 0.
    for (int c = 0; c < 26; ++c)
    {
        const int child = tree[0][c];
        if (child != 0)
            pending.push(child);
    }

    while (!pending.empty())
    {
        const int node = pending.front();
        pending.pop();
        const int link = fail[node];

        for (int c = 0; c < 26; ++c)
        {
            const int child = tree[node][c];
            if (child != 0)
            {
                // Existing child: its failure link is where the parent's link goes on c.
                fail[child] = tree[link][c];
                pending.push(child);
            }
            else
            {
                // No child on c: borrow the transition of the failure state.
                tree[node][c] = tree[link][c];
            }
        }
    }
}
// Driver: reads a text and n words, then scans the text once, removing a word as soon as
// it appears at the end of the kept characters, and prints the characters that remain.
// Only ed[curr] of the current state is checked, so a word is detected when the current
// state itself is a word end.
int main()
{
    string txt;
    cin >> txt;
    int n;
    cin >> n;
    for (int i = 0; i < n; i++)
    {
        string word;
        cin >> word;
        insert(word);
    }
    buildAC();
    int curr = 0;
    for (int i = 0; i < static_cast<int>(txt.size()); i++)
    {
        curr = tree[curr][ID(txt[i])];
        sta[top++] = i;
        bak[i] = curr;
        if (ed[curr])
        {
            // Drop the matched word (dep[curr] characters) from the stack and resume from
            // the state recorded for the new top character. If nothing is left, restart
            // at the root instead of reading sta[-1].
            top -= dep[curr];
            curr = (top > 0) ? bak[sta[top - 1]] : 0;
        }
    }
    for (int i = 0; i < top; i++)
    {
        cout << txt[sta[i]];
    }
    return 0;
}