题目链接
题目
It’s well known that a DNA sequence is a sequence that contains only A, C, T and G, and it’s very useful to analyze a segment of a DNA sequence. For example, if an animal’s DNA sequence contains the segment ATC, it may mean that the animal has a genetic disease. So far scientists have found several such segments; the problem is to count how many DNA sequences of a species don’t contain any of those segments.
Suppose that a DNA sequence of a species is a sequence that consists of A, C, T and G, and that the length of the sequence is a given integer n.
Input
The first line contains two integers m (0 <= m <= 10) and n (1 <= n <= 2000000000). Here, m is the number of genetic disease segments, and n is the length of the sequences.
Each of the next m lines contains one genetic disease segment; the length of each segment is at most 10.
Output
An integer, the number of DNA sequences, mod 100000.
Sample Input
4 3
AT
AC
AG
AA
Sample Output
36
题意
求长度为n、由A、C、T、G组成且不包含任何给定病毒字符串的字符串总数,结果对100000取模。
分析
AC自动机的另一大应用。AC自动机是一种有限状态自动机,将每一个结点看作一种状态:一个路径字符串(从根结点到该结点)。根据本题要求,每个状态最多可以向四种状态转移,即在末尾添加'A'、'T'、'G'、'C'形成新的路径字符串。关键是使得添加后的字符串不包含病毒字符串。
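可以通过建立AC自动机的过程确定哪些路径字符串包含病毒字符串。首先每一个病毒字符串本身一定是,然后根据后缀结点的意义,来从浅层(短字符串)向深层(长字符串)推导。如果结点u的后缀结点是包含病毒字符串的路径字符串的尾结点,那么u的路径字符串的后缀与后缀结点的路径字符串重合,一定也包含病毒字符串。最后,把所有不含病毒字符串的状态之间的转移写成矩阵A(A[u][v]为从状态u添加一个字符到达状态v的方案数),则A的n次幂中根结点所在行的各项之和就是答案;由于n可达2000000000,需要用矩阵快速幂计算。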
AC代码
//719ms 0.8MB
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <set>
#include <queue>
typedef long long ll;
using namespace std;
const int mod=100000; // every count is reduced modulo this value
const int maxn=108; // capacity of the trie node pool: 10 patterns * 10 letters + the root = 101, with some slack
const int maxc=4; // branching factor of the trie: one child slot per nucleotide letter
ll n; // written by main(): the first integer of each test case, used there as the number of patterns to read

// Aho-Corasick automaton over the alphabet {A, T, G, C}, stored in fixed-size arrays.
//
// Every node stands for the string spelled on the path from the root to that node.
// The failure link of a node x points to the node whose path string is the longest
// proper suffix of x's path string that is also a path in the trie (it does not have
// to be a complete pattern).
//
// Usage: init(), then insert() every pattern, then build(). After build() the child
// table is a complete transition function (no -1 entries are left) and sta[] marks
// every state whose path string contains at least one pattern.
struct ac_au
{
    int child[maxn][maxc]; // transitions; -1 marks a missing trie edge until build() fills it in
    int fail[maxn];        // failure link of each node
    int sta[maxn];         // 1 if the node's path string contains a forbidden pattern, else 0
    int cnt;               // number of nodes allocated so far
    int root;              // index of the root node

    // Allocates a node with no outgoing edges and returns its index.
    // Performs no capacity check itself; insert() guarantees cnt stays within maxn.
    int newnode()
    {
        for(int i=0;i<maxc;i++)
            child[cnt][i]=-1;
        sta[cnt]=0;
        fail[cnt]=0;
        return cnt++;
    }

    // Resets the automaton to a single root node.
    void init()
    {
        cnt=0;
        root=newnode();
        fail[root]=root; // set explicitly so the struct does not depend on zero-initialised storage
    }

    // Maps a nucleotide letter to its child slot; returns -1 for any other character.
    int getid(char ch)
    {
        switch(ch)
        {
            case 'A': return 0;
            case 'T': return 1;
            case 'G': return 2;
            case 'C': return 3;
            default:  return -1;
        }
    }

    // Adds the pattern s to the trie and marks its last node as forbidden.
    // The pattern is ignored when it contains a character outside the alphabet (it can
    // never occur in a generated sequence) or when it might not fit into the node pool.
    // Neither case can occur for inputs that respect the limits of the problem statement.
    void insert(char *s)
    {
        int len=strlen(s);
        for(int i=0;i<len;i++)
            if(getid(s[i])<0) return;
        if(cnt+len>maxn) return; // worst case: every letter needs a new node

        int p=root;
        for(int i=0;i<len;i++)
        {
            int &next=child[p][getid(s[i])];
            if(next==-1)
                next=newnode();
            p=next;
        }
        sta[p]=1;
    }

    // Computes the failure links breadth-first, completes the transition table and
    // propagates the forbidden mark along the failure links.
    void build()
    {
        queue<int> que;
        for(int i=0;i<maxc;i++)
        {
            int &num=child[root][i];
            if(num==-1)
                num=root; // a missing edge at the root loops back to the root, so the
                          // lookups child[fail[p]][i] below are always defined
            else
            {
                fail[num]=root;
                que.push(num);
            }
        }
        while(!que.empty())
        {
            int p=que.front();que.pop();
            // BFS visits shallower nodes first, so fail[p] and sta[fail[p]] are final here.
            int f=fail[p];
            // If a suffix of p's path string contains a pattern, so does p's path string.
            if(sta[f]) sta[p]=1;
            for(int i=0;i<maxc;i++)
            {
                int &num=child[p][i];
                if(num==-1)
                    num=child[f][i]; // no trie edge: reuse the transition of the failure state
                else
                {
                    fail[num]=child[f][i];
                    que.push(num);
                }
            }
        }
    }
}ac;
// Square integer matrix backed by a fixed 110x110 array.
// Only the top-left ac.cnt x ac.cnt corner is initialised and used by the code in this file.
struct matrix
{
    int m[110][110];
    int n; // not referenced by any code visible in this file; left uninitialised
    // Zeroes the ac.cnt x ac.cnt corner; cells outside it keep whatever the storage held.
    matrix()
    {
        const int dim=ac.cnt;
        for(int row=0;row<dim;++row)
            memset(m[row],0,dim*sizeof(int));
    }
};
// Builds the one-step transition matrix of the automaton stored in `ac`.
// Entry [u][v] is the number of letters that move state u to state v, counting only
// moves where neither u nor v is marked in ac.sta (neither contains a forbidden pattern).
// ac.build() must have run first: before that, child entries may still be -1 and
// would be used as an array index here.
matrix getMatrix()
{
    matrix A; // the constructor zeroes the ac.cnt x ac.cnt corner
    for(int u=0;u<ac.cnt;u++)
    {
        if(ac.sta[u]) continue; // nothing may leave a forbidden state
        for(int c=0;c<maxc;c++) // one outgoing edge per letter of the alphabet
        {
            int v=ac.child[u][c];
            if(!ac.sta[v])
                A.m[u][v]++;
        }
    }
    return A;
}
// Returns the product A*B of the ac.cnt x ac.cnt corners, reducing with `mod` after
// every multiplication and addition.
// The operands are taken by const reference to avoid copying two 110x110 arrays per
// call; the result lives in a separate local, so passing the same matrix twice
// (or assigning the result back to an operand) is safe.
matrix mul_mod(const matrix &A,const matrix &B)
{
    const int dim=ac.cnt;
    matrix C;
    for(int i=0;i<dim;i++)
    {
        for(int j=0;j<dim;j++)
            C.m[i][j]=0;
        for(int k=0;k<dim;k++)
        {
            const long long a=A.m[i][k];
            if(a==0) continue; // a zero factor contributes nothing to row i
            for(int j=0;j<dim;j++)
            {
                // 64-bit product: two values below mod can exceed the int range
                int tmp=(int)(a*B.m[k][j]%mod);
                C.m[i][j]=(C.m[i][j]+tmp)%mod;
            }
        }
    }
    return C;
}
// Returns A raised to the k-th power (entries reduced by mul_mod) using binary
// exponentiation, i.e. O(log k) matrix multiplications.
// A non-positive exponent yields the identity matrix; previously such a k made the
// loop counter negative and the loop never terminated.
matrix pow_mod(matrix A,ll k)
{
    if(k<=0)
    {
        matrix I; // constructor zeroes the ac.cnt x ac.cnt corner
        for(int i=0;i<ac.cnt;i++)
            I.m[i][i]=1%mod;
        return I;
    }
    // Start from A^1 so that no multiplication by the identity is needed.
    matrix ans=A;
    k--;
    while(k>0)
    {
        if(k&1)
            ans=mul_mod(ans,A);
        k>>=1;
        if(k>0) // the square is only needed if another bit remains
            A=mul_mod(A,A);
    }
    return ans;
}
char s[105];
int main()
{
ll k;
while(~scanf("%lld%lld",&n,&k))
{
ac.init();
for(int i=1; i<=n; i++)
{
scanf("%s",s);
ac.insert(s);
}
ac.build();
matrix A=getMatrix();
matrix ans=pow_mod(A,k);
ll sum=0;
for(int i=0;i<ac.cnt;i++)
sum=(sum+ans.m[0][i])%mod;
printf("%lld\n",sum);
}
return 0;
}