参考博客:kuangbin AC自动机小结,AC自动机算法 海量数据处理之Tire树(字典树)
AC自动机(Aho-Corasick automaton)是建立在字典树(Trie)上的多模式串快速匹配算法;
一个典型的例子就是:给出N个单词和一篇文章,判断文章中出现了多少种(个)之前给出的单词。
要想理解AC自动机必须先学Trie。Trie是一棵k叉树,除根节点之外,每个节点都储存了一个字符(字母),于是从根节点往下看,每一条路径都对应一个单词(或单词的前缀)。
AC自动机就是建立在Trie数据结构上的一个算法,类似于在Trie树上做KMP。类似KMP的next指针,它也有一个fail指针,用来加速匹配。
它可以判断某单词是否在文章中出现(可重叠),以及出现的次数。
具体原理不赘述,网上各种解释遍地开花。对于偷懒者来说,求个模版就算了……
AC自动机算法主要有三个步骤
1)建Trie树
2)构造fail指针
3)匹配
【模版】(带详细解释)
const int MAXN = 500*200; // node-pool capacity: number of patterns * maximum pattern length
const int MAXL = 10000+10; // maximum length of the searched text
const int MAXM = 128; // branches per trie node = number of character codes (0..127)

/*
 * Aho-Corasick automaton stored in fixed-size arrays.
 *
 * Usage order: init(), insert() every pattern, build(), then query().
 * insert() must not be called after build(): build() overwrites every
 * missing child link with a fallback transition.
 *
 * The object is tens of megabytes; give it static storage, not the stack.
 */
struct Trie
{
    enum { NUM_CAP = 501 }; // capacity of num[]

    int next[MAXN][MAXM]; // child / transition table, -1 = no child (before build())
    int fail[MAXN];       // failure link of every node
    int end[MAXN];        // id of the pattern ending at the node, -1 = none
    int mark[MAXN];       // number of the last query() that visited the node
    int root,L,stamp;     // root index, nodes in use, current query number
    int num[NUM_CAP];     // ids found by the last query(), in discovery order

    // Branch index for a character, or -1 when its code is outside
    // 0..MAXM-1. Going through unsigned char keeps bytes >= 0x80 from
    // becoming negative array indices where plain char is signed.
    static int slot(char c)
    {
        const int code = static_cast<unsigned char>(c);
        return code < MAXM ? code : -1;
    }

    // Takes the next node from the pool, clears it and returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L] = -1; // no pattern ends here yet
        mark[L] = 0;
        return L++;
    }

    // Resets the automaton to a lone root node.
    void init()
    {
        L = 0;
        stamp = 0;
        root = newnode();
    }

    // Adds pattern buf and remembers id at its last node.
    // Returns false, leaving the trie untouched, when the pattern holds a
    // character outside the alphabet or the node pool cannot hold it.
    bool insert(char buf[],int id)
    {
        const int len = static_cast<int>(strlen(buf));
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return false;

        // Follow the prefix that is already stored.
        int now = root;
        int depth = 0;
        while(depth < len && next[now][slot(buf[depth])] != -1)
        {
            now = next[now][slot(buf[depth])];
            depth++;
        }

        // The remaining characters need one new node each.
        if(len - depth > MAXN - L)
            return false;
        for(;depth < len;depth++)
        {
            const int child = newnode();
            next[now][slot(buf[depth])] = child;
            now = child;
        }
        end[now] = id;
        return true;
    }

    // Computes the fail links breadth-first and replaces every missing
    // child with the transition of the fail node, so that matching never
    // has to follow fail links to find the next state.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for(int c = 0;c < MAXM;c++)
        {
            int &child = next[root][c];
            if(child == -1)
                child = root;
            else
            {
                fail[child] = root;
                pending.push(child);
            }
        }
        while(!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for(int c = 0;c < MAXM;c++)
            {
                int &child = next[now][c];
                const int fallback = next[fail[now]][c];
                if(child == -1)
                    child = fallback;
                else
                {
                    fail[child] = fallback;
                    pending.push(child);
                }
            }
        }
    }

    // Searches text buf. The id of every pattern that occurs is stored
    // once in num[] (at most NUM_CAP ids); unused entries are 0.
    // n (pattern count) and id (text number) are accepted for interface
    // compatibility and are not used here.
    // Returns 1 when at least one pattern occurs, otherwise 0.
    int query(char buf[],int n,int id)
    {
        (void)n;
        (void)id;
        memset(num,0,sizeof(num));
        ++stamp;
        int ct = 0;
        int now = root;
        for(const char *p = buf;*p;++p)
        {
            const int c = slot(*p);
            if(c < 0)
            {
                // No pattern contains this character, so no match spans it.
                now = root;
                continue;
            }
            now = next[now][c];
            // A node already visited in this query has had its whole fail
            // chain reported, so the walk can stop there.
            for(int t = now;t != root && mark[t] != stamp;t = fail[t])
            {
                mark[t] = stamp;
                if(end[t] != -1 && ct < NUM_CAP)
                    num[ct++] = end[t];
            }
        }
        return ct > 0 ? 1 : 0;
    }
};
三道入门题,注意字符种类总数。
题意:求出现了多少种单词
【代码】
/* ***********************************************
Author :angon
************************************************ */
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <stack>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
using namespace std;
// Debug helper: writes the processor time used so far, in seconds, to stderr.
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
// Expands to the bare tokens %I64d (not a string literal); unused in the visible code.
#define lld %I64d
// Counting loops: REP runs i over [k, n), REPP runs i over [k, n].
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
// scanf shorthands. Each expands to the scanf call itself, so the value of
// the macro expression is scanf's return value. The "l" variants read with %I64d.
#define scan(d) scanf("%d",&d)
#define scanl(d) scanf("%I64d",&d)
#define scann(n,m) scanf("%d%d",&n,&m)
#define scannl(n,m) scanf("%I64d%I64d",&n,&m)
// Fills array a with byte value k. a must be a true array: sizeof(a) on a
// pointer would give the pointer size, not the buffer size.
#define mst(a,k) memset(a,k,sizeof(a))
#define LL long long
// Constants that the visible code does not reference.
#define N 1005
#define mod 1000000007
// Reads the next run of decimal digits from stdin and returns its value.
// Leading non-digit characters are skipped and the character that ends the
// run is consumed. Returns 0 when end of input is reached before any digit.
// No sign handling and no overflow check.
inline int read()
{
    int s = 0;
    // int, not char: EOF must stay distinguishable from every character,
    // otherwise the skip loop below never terminates at end of input.
    int ch = getchar();
    while(ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    for(;ch >= '0' && ch <= '9';ch = getchar())
        s = s*10 + (ch - '0');
    return s;
}
const int MAXN = 500010; // node-pool capacity (root plus one node per stored pattern character)
const int MAXL = 1000010; // size of the text buffer
const int MAXM = 26; // branches per trie node: one per letter 'a'..'z'

/*
 * Aho-Corasick automaton over lowercase letters that counts how many of
 * the inserted patterns occur in a text.
 *
 * Usage order: init(), insert() every pattern, build(), then query().
 * insert() must not be called after build(): build() overwrites every
 * missing child link with a fallback transition.
 *
 * The object is tens of megabytes; give it static storage, not the stack.
 */
struct Trie
{
    int next[MAXN][MAXM]; // child / transition table, -1 = no child (before build())
    int fail[MAXN];       // failure link of every node
    int end[MAXN];        // number of not-yet-counted patterns ending at the node
    bool done[MAXN];      // true once query() has drained the node
    int root,L;           // root index, nodes in use

    // Branch index for a character, or -1 when it is not in 'a'..'z'.
    static int slot(char c)
    {
        const int k = c - 'a';
        return (k >= 0 && k < MAXM) ? k : -1;
    }

    // Takes the next node from the pool, clears it and returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L] = 0;
        done[L] = false;
        return L++;
    }

    // Resets the automaton to a lone root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Adds pattern buf; inserting the same pattern twice counts it twice.
    // Returns false, leaving the trie untouched, when the pattern holds a
    // character outside 'a'..'z' or the node pool cannot hold it.
    bool insert(char buf[])
    {
        const int len = static_cast<int>(strlen(buf));
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return false;

        // Follow the prefix that is already stored.
        int now = root;
        int depth = 0;
        while(depth < len && next[now][slot(buf[depth])] != -1)
        {
            now = next[now][slot(buf[depth])];
            depth++;
        }

        // The remaining characters need one new node each.
        if(len - depth > MAXN - L)
            return false;
        for(;depth < len;depth++)
        {
            const int child = newnode();
            next[now][slot(buf[depth])] = child;
            now = child;
        }
        end[now]++;
        return true;
    }

    // Computes the fail links breadth-first and replaces every missing
    // child with the transition of the fail node, so that matching never
    // has to follow fail links to find the next state.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for(int c = 0;c < MAXM;c++)
        {
            int &child = next[root][c];
            if(child == -1)
                child = root;
            else
            {
                fail[child] = root;
                pending.push(child);
            }
        }
        while(!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for(int c = 0;c < MAXM;c++)
            {
                int &child = next[now][c];
                const int fallback = next[fail[now]][c];
                if(child == -1)
                    child = fallback;
                else
                {
                    fail[child] = fallback;
                    pending.push(child);
                }
            }
        }
    }

    // Returns how many inserted patterns occur in buf. The count is
    // destructive: a pattern that has been counted is zeroed, so it is not
    // counted again, neither later in this text nor by a later query().
    int query(char buf[])
    {
        int res = 0;
        int now = root;
        for(const char *p = buf;*p;++p)
        {
            const int c = slot(*p);
            if(c < 0)
            {
                // No pattern contains this character, so no match spans it.
                now = root;
                continue;
            }
            now = next[now][c];
            // A drained node has end == 0 and so has everything behind it
            // on the fail chain; stopping there gives the same sum without
            // re-walking the chain at every text position.
            for(int t = now;t != root && !done[t];t = fail[t])
            {
                res += end[t];
                end[t] = 0;
                done[t] = true;
            }
        }
        return res;
    }

    // Dumps every node (index, fail link, end counter, transitions) to stdout.
    void debug()
    {
        for(int i = 0;i < L;i++)
        {
            printf("id = %3d,fail = %3d,end = %3d,chi = [",i,fail[i],end[i]);
            for(int j = 0;j < MAXM;j++)
                printf("%2d",next[i][j]);
            printf("]\n");
        }
    }
};
char buf[MAXL]; // input buffer shared by the patterns and the text
Trie ac;        // static storage: the automaton is far too large for the stack

// Reads t test cases. Each case is a pattern count n, n patterns and one
// text; prints the value ac.query() returns for that text.
// Stops quietly when the input ends early or is malformed.
// NOTE: "%s" has no field width; tokens longer than MAXL-1 characters
// would overflow buf.
int main()
{
    //freopen("in.txt","r",stdin);
    //freopen("out.txt","w",stdout);
    int t;
    if(scanf("%d",&t) != 1)
        return 0;
    while(t-- > 0)
    {
        int n;
        if(scanf("%d",&n) != 1)
            break;
        ac.init();
        for(int i = 0;i < n;i++)
        {
            if(scanf("%s",buf) != 1)
                return 0;
            ac.insert(buf);
        }
        ac.build();
        if(scanf("%s",buf) != 1)
            break;
        printf("%d\n",ac.query(buf));
    }
    return 0;
}
题意:要求输出每个单词出现的次数
【代码】
/* ***********************************************
Author :angon
************************************************ */
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <stack>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
using namespace std;
// Debug helper: writes the processor time used so far, in seconds, to stderr.
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
// Expands to the bare tokens %I64d (not a string literal); unused in the visible code.
#define lld %I64d
// Counting loops: REP runs i over [k, n), REPP runs i over [k, n].
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
// scanf shorthands. Each expands to the scanf call itself, so the value of
// the macro expression is scanf's return value. The "l" variants read with %I64d.
#define scan(d) scanf("%d",&d)
#define scanl(d) scanf("%I64d",&d)
#define scann(n,m) scanf("%d%d",&n,&m)
#define scannl(n,m) scanf("%I64d%I64d",&n,&m)
// Fills array a with byte value k. a must be a true array: sizeof(a) on a
// pointer would give the pointer size, not the buffer size.
#define mst(a,k) memset(a,k,sizeof(a))
#define LL long long
// Constants that the visible code does not reference.
#define N 1005
#define mod 1000000007
// Reads the next run of decimal digits from stdin and returns its value.
// Leading non-digit characters are skipped and the character that ends the
// run is consumed. Returns 0 when end of input is reached before any digit.
// No sign handling and no overflow check.
inline int read()
{
    int s = 0;
    // int, not char: EOF must stay distinguishable from every character,
    // otherwise the skip loop below never terminates at end of input.
    int ch = getchar();
    while(ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    for(;ch >= '0' && ch <= '9';ch = getchar())
        s = s*10 + (ch - '0');
    return s;
}
const int MAXN = 1010*50; // node-pool capacity: number of patterns * 50
const int MAXL = 2000000+10; // maximum length of the searched text
const int MAXM = 128; // branches per trie node = number of character codes (0..127)
char str[1005][100]; // pattern texts indexed by pattern id; query() prints from here

/*
 * Aho-Corasick automaton that counts how often each pattern occurs in a
 * text (overlapping occurrences included).
 *
 * Usage order: init(), insert() every pattern, build(), then query().
 * insert() must not be called after build(): build() overwrites every
 * missing child link with a fallback transition.
 *
 * The object is tens of megabytes; give it static storage, not the stack.
 */
struct Trie
{
    enum { NUM_CAP = 1001 }; // capacity of num[] = number of usable pattern ids

    int next[MAXN][MAXM]; // child / transition table, -1 = no child (before build())
    int fail[MAXN];       // failure link of every node
    int end[MAXN];        // id of the pattern ending at the node, -1 = none
    int root,L;           // root index, nodes in use
    int num[NUM_CAP];     // num[id] = occurrences of pattern id in the last query()

    // Branch index for a character, or -1 when its code is outside
    // 0..MAXM-1. Going through unsigned char keeps bytes >= 0x80 from
    // becoming negative array indices where plain char is signed.
    static int slot(char c)
    {
        const int code = static_cast<unsigned char>(c);
        return code < MAXM ? code : -1;
    }

    // Takes the next node from the pool, clears it and returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L] = -1; // no pattern ends here yet
        return L++;
    }

    // Resets the automaton to a lone root node.
    void init()
    {
        L = 0;
        root = newnode();
    }

    // Adds pattern buf under id. Inserting an identical pattern again
    // replaces the id stored for it.
    // Returns false, leaving the trie untouched, when id cannot index
    // num[], the pattern holds a character outside the alphabet, or the
    // node pool cannot hold it.
    bool insert(char buf[],int id)
    {
        if(id < 0 || id >= NUM_CAP)
            return false;
        const int len = static_cast<int>(strlen(buf));
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return false;

        // Follow the prefix that is already stored.
        int now = root;
        int depth = 0;
        while(depth < len && next[now][slot(buf[depth])] != -1)
        {
            now = next[now][slot(buf[depth])];
            depth++;
        }

        // The remaining characters need one new node each.
        if(len - depth > MAXN - L)
            return false;
        for(;depth < len;depth++)
        {
            const int child = newnode();
            next[now][slot(buf[depth])] = child;
            now = child;
        }
        end[now] = id;
        return true;
    }

    // Computes the fail links breadth-first and replaces every missing
    // child with the transition of the fail node, so that matching never
    // has to follow fail links to find the next state.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for(int c = 0;c < MAXM;c++)
        {
            int &child = next[root][c];
            if(child == -1)
                child = root;
            else
            {
                fail[child] = root;
                pending.push(child);
            }
        }
        while(!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for(int c = 0;c < MAXM;c++)
            {
                int &child = next[now][c];
                const int fallback = next[fail[now]][c];
                if(child == -1)
                    child = fallback;
                else
                {
                    fail[child] = fallback;
                    pending.push(child);
                }
            }
        }
    }

    // Counts the occurrences of every pattern in buf into num[], then
    // prints "<pattern>: <count>" for each id in [0, n) whose count is
    // non-zero, taking the pattern text from the global str[id].
    void query(char buf[],int n)
    {
        memset(num,0,sizeof(num));
        int now = root;
        for(const char *p = buf;*p;++p)
        {
            const int c = slot(*p);
            if(c < 0)
            {
                // No pattern contains this character, so no match spans it.
                now = root;
                continue;
            }
            now = next[now][c];
            // Every node on the fail chain is a suffix of the text read so
            // far; each one that ends a pattern is one more occurrence.
            for(int t = now;t != root;t = fail[t])
                if(end[t] != -1)
                    num[end[t]]++;
        }
        const int shown = n < NUM_CAP ? n : NUM_CAP; // never read past num[]
        for(int i = 0;i < shown;i++)
            if(num[i])
                printf("%s: %d\n",str[i],num[i]);
    }
};
char buf[MAXL];
Trie ac;
int main()
{
//freopen("in.txt","r",stdin);
//freopen("out.txt","w",stdout);
int n;
while(~scan(n))
{
ac.init();
REP(i,0,n)
{
scanf("%s",str[i]);
ac.insert(str[i],i);
}
ac.build();
scanf("%s",buf);
ac.query(buf,n);
}
return 0;
}
题意:输出出现了单词的编号
【代码】
/* ***********************************************
Author :angon
************************************************ */
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <stack>
#include <vector>
#include <queue>
#include <set>
#include <map>
#include <string>
#include <math.h>
#include <stdlib.h>
#include <time.h>
using namespace std;
// Debug helper: writes the processor time used so far, in seconds, to stderr.
#define showtime fprintf(stderr,"time = %.15f\n",clock() / (double)CLOCKS_PER_SEC)
// Expands to the bare tokens %I64d (not a string literal); unused in the visible code.
#define lld %I64d
// Counting loops: REP runs i over [k, n), REPP runs i over [k, n].
#define REP(i,k,n) for(int i=k;i<n;i++)
#define REPP(i,k,n) for(int i=k;i<=n;i++)
// scanf shorthands. Each expands to the scanf call itself, so the value of
// the macro expression is scanf's return value. The "l" variants read with %I64d.
#define scan(d) scanf("%d",&d)
#define scanl(d) scanf("%I64d",&d)
#define scann(n,m) scanf("%d%d",&n,&m)
#define scannl(n,m) scanf("%I64d%I64d",&n,&m)
// Fills array a with byte value k. a must be a true array: sizeof(a) on a
// pointer would give the pointer size, not the buffer size.
#define mst(a,k) memset(a,k,sizeof(a))
#define LL long long
// Constants that the visible code does not reference.
#define N 1005
#define mod 1000000007
// Reads the next run of decimal digits from stdin and returns its value.
// Leading non-digit characters are skipped and the character that ends the
// run is consumed. Returns 0 when end of input is reached before any digit.
// No sign handling and no overflow check.
inline int read()
{
    int s = 0;
    // int, not char: EOF must stay distinguishable from every character,
    // otherwise the skip loop below never terminates at end of input.
    int ch = getchar();
    while(ch != EOF && (ch < '0' || ch > '9'))
        ch = getchar();
    for(;ch >= '0' && ch <= '9';ch = getchar())
        s = s*10 + (ch - '0');
    return s;
}
const int MAXN = 500*200; // node-pool capacity: number of patterns * maximum pattern length
const int MAXL = 10000+10; // maximum length of the searched text
const int MAXM = 128; // branches per trie node = number of character codes (0..127)

/*
 * Aho-Corasick automaton that reports which patterns occur in a text.
 *
 * Usage order: init(), insert() every pattern, build(), then query() once
 * per text. insert() must not be called after build(): build() overwrites
 * every missing child link with a fallback transition.
 *
 * The object is tens of megabytes; give it static storage, not the stack.
 */
struct Trie
{
    enum { NUM_CAP = 501 }; // capacity of num[]

    int next[MAXN][MAXM]; // child / transition table, -1 = no child (before build())
    int fail[MAXN];       // failure link of every node
    int end[MAXN];        // id of the pattern ending at the node, -1 = none
    int mark[MAXN];       // number of the last query() that visited the node
    int root,L,stamp;     // root index, nodes in use, current query number
    int num[NUM_CAP];     // ids found by the last query(), sorted ascending

    // Branch index for a character, or -1 when its code is outside
    // 0..MAXM-1. Going through unsigned char keeps bytes >= 0x80 from
    // becoming negative array indices where plain char is signed.
    static int slot(char c)
    {
        const int code = static_cast<unsigned char>(c);
        return code < MAXM ? code : -1;
    }

    // Takes the next node from the pool, clears it and returns its index.
    int newnode()
    {
        for(int i = 0;i < MAXM;i++)
            next[L][i] = -1;
        end[L] = -1; // no pattern ends here yet
        mark[L] = 0;
        return L++;
    }

    // Resets the automaton to a lone root node.
    void init()
    {
        L = 0;
        stamp = 0;
        root = newnode();
    }

    // Adds pattern buf and remembers id at its last node.
    // Returns false, leaving the trie untouched, when the pattern holds a
    // character outside the alphabet or the node pool cannot hold it.
    bool insert(char buf[],int id)
    {
        const int len = static_cast<int>(strlen(buf));
        for(int i = 0;i < len;i++)
            if(slot(buf[i]) < 0)
                return false;

        // Follow the prefix that is already stored.
        int now = root;
        int depth = 0;
        while(depth < len && next[now][slot(buf[depth])] != -1)
        {
            now = next[now][slot(buf[depth])];
            depth++;
        }

        // The remaining characters need one new node each.
        if(len - depth > MAXN - L)
            return false;
        for(;depth < len;depth++)
        {
            const int child = newnode();
            next[now][slot(buf[depth])] = child;
            now = child;
        }
        end[now] = id;
        return true;
    }

    // Computes the fail links breadth-first and replaces every missing
    // child with the transition of the fail node, so that matching never
    // has to follow fail links to find the next state.
    void build()
    {
        std::queue<int> pending;
        fail[root] = root;
        for(int c = 0;c < MAXM;c++)
        {
            int &child = next[root][c];
            if(child == -1)
                child = root;
            else
            {
                fail[child] = root;
                pending.push(child);
            }
        }
        while(!pending.empty())
        {
            const int now = pending.front();
            pending.pop();
            for(int c = 0;c < MAXM;c++)
            {
                int &child = next[now][c];
                const int fallback = next[fail[now]][c];
                if(child == -1)
                    child = fallback;
                else
                {
                    fail[child] = fallback;
                    pending.push(child);
                }
            }
        }
    }

    // Searches text buf. When at least one pattern occurs, prints
    // "web <id>: " followed by the 1-based ids of the patterns found, in
    // ascending order and each listed once, and returns 1. Prints nothing
    // and returns 0 otherwise.
    // n (pattern count) is accepted for interface compatibility only.
    int query(char buf[],int n,int id)
    {
        (void)n;
        memset(num,0,sizeof(num));
        ++stamp;
        int ct = 0;
        int now = root;
        for(const char *p = buf;*p;++p)
        {
            const int c = slot(*p);
            if(c < 0)
            {
                // No pattern contains this character, so no match spans it.
                now = root;
                continue;
            }
            now = next[now][c];
            // Visit each node at most once per query: a pattern is reported
            // once no matter how often it occurs, ct stays bounded by the
            // number of patterns, and a node already visited has had its
            // whole fail chain reported, so the walk can stop there.
            for(int t = now;t != root && mark[t] != stamp;t = fail[t])
            {
                mark[t] = stamp;
                if(end[t] != -1 && ct < NUM_CAP)
                    num[ct++] = end[t];
            }
        }
        if(ct == 0)
            return 0;
        printf("web %d: ",id);
        std::sort(num,num+ct);
        for(int i = 0;i < ct;i++)
            printf("%d%c",num[i]+1,i==ct-1?'\n':' ');
        return 1;
    }
};
char buf[MAXL];
Trie ac;
int main()
{
//freopen("in.txt","r",stdin);
//freopen("out.txt","w",stdout);
int n;
while(~scan(n))
{
ac.init();
REP(i,0,n)
{
scanf("%s",buf);
ac.insert(buf,i);
}
ac.build();
int m; scan(m);
int total = 0;
REPP(i,1,m)
{
scanf("%s",buf);
if(ac.query(buf,n,i))
total++;
}
printf("total: %d\n",total);
}
return 0;
}
以上~