【AC自动机+矩阵快速幂】DNA Sequence POJ - 2778（求不包含特定字符串的长为n的字符串的总数）

最新推荐文章于 2021-09-05 00:39:13 发布

EnjoyingAC

最新推荐文章于 2021-09-05 00:39:13 发布

阅读量335

点赞数

分类专栏：图论数据结构 AC自动机矩阵快速幂

本文链接：https://blog.csdn.net/qq_37685156/article/details/80476847

版权

数据结构同时被 3 个专栏收录

78 篇文章 0 订阅

订阅专栏

图论

77 篇文章 0 订阅

订阅专栏

AC自动机

8 篇文章 0 订阅

订阅专栏

题目链接

DNA Sequence POJ - 2778

题目

It’s well known that DNA Sequence is a sequence only contains A, C, T and G, and it’s very useful to analyze a segment of DNA Sequence，For example, if a animal’s DNA sequence contains segment ATC then it may mean that the animal may have a genetic disease. Until now scientists have found several those segments, the problem is how many kinds of DNA sequences of a species don’t contain those segments.

Suppose that DNA sequences of a species is a sequence that consist of A, C, T and G，and the length of sequences is a given integer n.
Input
First line contains two integer m (0 <= m <= 10), n (1 <= n <=2000000000). Here, m is the number of genetic disease segment, and n is the length of sequences.

Next m lines each line contain a DNA genetic disease segment, and length of these segments is not larger than 10.
Output
An integer, the number of DNA sequences, mod 100000.
Sample Input
4 3
AT
AC
AG
AA
Sample Output
36

题意

求不包含特定字符串的长为n的字符串的总数

分析

AC自动机的另一大应用。AC自动机是一种有限状态自动机，每一个结点可以看作一种状态：从根结点到该结点的路径字符串。根据本题要求，每个状态最多可以向四种状态转移，即在路径字符串末尾添加'A'、'T'、'G'、'C'形成新的路径字符串。关键是保证添加后的字符串不包含病毒字符串。

可以在建立AC自动机的过程中确定哪些路径字符串包含病毒字符串。首先，每一个病毒字符串本身一定包含病毒字符串；然后根据后缀结点（fail指针）的意义，从浅层（短字符串）向深层（长字符串）推导：如果结点u的后缀结点是包含病毒字符串的路径字符串的尾结点，那么由于后缀结点的路径字符串是u的路径字符串的后缀，u的路径字符串也一定包含病毒字符串。

AC代码

//719ms 0.8MB
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <set>
#include <queue>
typedef long long ll; // 64-bit integer alias, used for the two input numbers and the matrix exponent
using namespace std;
const int mod=100000; // every matrix entry and the final answer are reduced modulo this value
const int maxn=108; // node capacity of the trie: max pattern length * max pattern count, plus the root and some slack
const int maxc=4; // branching factor of the trie (alphabet A, T, G, C)
long long n; // first number read by main(): how many forbidden patterns follow

// Aho-Corasick automaton over the alphabet {A, T, G, C}.
//
// Each node represents the string spelled by the path from the root to it.
// The failure link of a node x points to the node whose path string is the
// longest proper suffix of x's path string that is also a path in the trie.
//
// Usage: init(), then insert() every pattern, then build() exactly once.
// After build(), child[][] is a complete transition table (no -1 entries) and
// sta[] marks every node whose path string contains at least one pattern.
// Calling build() a second time without init() is not supported: the filled-in
// transitions would be mistaken for real trie edges.
struct ac_au
{
    int child[maxn][maxc]; // trie edges (-1 = absent) before build(); full goto table after build()
    int fail[maxn];        // failure link of each node
    int sta[maxn];         // non-zero if the node's path string contains a forbidden pattern
    int cnt;               // number of nodes allocated so far
    int root;              // index of the root node (always 0 after init())

    // Allocates a fresh node with no children and returns its index.
    // No capacity check is performed: the caller must keep the total number
    // of nodes below maxn.
    int newnode()
    {
        for(int i=0;i<maxc;i++)
            child[cnt][i]=-1;
        fail[cnt]=0; // the root is node 0, so this is a safe default link
        sta[cnt]=0;
        return cnt++;
    }

    // Resets the automaton to a single root node.
    void init()
    {
        cnt=0;
        root=newnode();
    }

    // Maps a nucleotide letter to its child index, or -1 for any other character.
    int getid(char ch) const
    {
        switch(ch)
        {
            case 'A': return 0;
            case 'T': return 1;
            case 'G': return 2;
            case 'C': return 3;
            default:  return -1;
        }
    }

    // Inserts one forbidden pattern into the trie and marks its end node.
    // A pattern containing a character outside the alphabet can never occur
    // in a sequence over A/T/G/C, so it is ignored entirely; validating up
    // front avoids leaving a half-inserted path behind.
    void insert(const char *s)
    {
        for(const char *q=s; *q; ++q)
            if(getid(*q)<0)
                return;

        int p=root;
        for(const char *q=s; *q; ++q)
        {
            int &num=child[p][getid(*q)];
            if(num==-1)
                num=newnode();
            p=num;
        }
        sta[p]=1; // end node of a forbidden pattern
    }

    // Computes failure links breadth-first, fills in the missing transitions,
    // and propagates the "contains a pattern" flag along failure links.
    void build()
    {
        std::queue<int> que;
        for(int i=0;i<maxc;i++)
        {
            int &num=child[root][i];
            if(num==-1)
                num=root; // missing edge at the root loops back to the root
            else
            {
                fail[num]=root; // depth-1 nodes always fail to the root
                que.push(num);
            }
        }
        while(!que.empty())
        {
            int p=que.front();
            que.pop();
            int f=fail[p];
            // BFS visits shallower nodes first, so sta[f] is already final here.
            // If the suffix f contains a pattern, so does p's path string.
            if(sta[f])
                sta[p]=1;
            for(int i=0;i<maxc;i++)
            {
                int &num=child[p][i];
                if(num==-1)
                    num=child[f][i]; // borrow the transition of the failure node
                else
                {
                    fail[num]=child[f][i];
                    que.push(num);
                }
            }
        }
    }

}ac;
// Fixed-capacity square matrix of ints used for the transition counts.
// The constructor zeroes every entry (and n), so a freshly built matrix has
// no indeterminate values and does not depend on any global state.
struct matrix
{
    int m[110][110],n;
    matrix() : m(), n(0) {}
};
// Builds the one-step transition matrix of the automaton: entry [u][v] is the
// number of letters that move state u to state v, counting only moves where
// neither u nor v is flagged in ac.sta (i.e. neither contains a pattern).
matrix getMatrix()
{
    matrix trans;
    for(int from=0; from<ac.cnt; ++from)
    {
        for(int letter=0; letter<4; ++letter) // one outgoing edge per letter
        {
            const int to=ac.child[from][letter];
            // Skip edges that start in or lead into a flagged state.
            if(ac.sta[from] || ac.sta[to])
                continue;
            trans.m[from][to]+=1;
        }
    }
    return trans;
}
// Returns (A * B) mod `mod`, restricted to the leading ac.cnt x ac.cnt block.
// Operands are taken by const reference to avoid copying two large matrices
// on every call. Entries are expected to lie in [0, mod); under that
// precondition each product is below mod^2 and a row-by-column sum of
// ac.cnt such products fits comfortably in a long long, so a single
// reduction per output entry is enough.
matrix mul_mod(const matrix &A,const matrix &B)
{
    matrix C;
    const int dim=ac.cnt;
    for(int i=0; i<dim; i++)
        for(int j=0; j<dim; j++)
        {
            long long acc=0;
            for(int k=0; k<dim; k++)
                acc+=static_cast<long long>(A.m[i][k])*B.m[k][j];
            C.m[i][j]=static_cast<int>(acc%mod);
        }
    return C;
}
// Fast matrix exponentiation: returns A^k with every product taken by mul_mod.
// For k <= 0 the identity matrix is returned (A^0); without this guard the
// decrement below would make k negative and the shift loop would never end.
matrix pow_mod(matrix A,ll k)
{
    if(k<=0)
    {
        matrix I;
        const int dim=sizeof(I.m)/sizeof(I.m[0]);
        for(int i=0; i<dim; i++)
            for(int j=0; j<dim; j++)
                I.m[i][j]=(i==j)?1:0;
        return I;
    }

    matrix ans=A; // start from A^1, so only k-1 more factors are needed
    k--;
    while(k)
    {
        if(k&1)
            ans=mul_mod(ans,A);
        k>>=1;
        if(k) // skip the squaring that would never be used
            A=mul_mod(A,A);
    }
    return ans;
}
char s[105]; // input buffer for one forbidden pattern
// Reads test cases until input is exhausted. Each case starts with two
// numbers: n, the number of forbidden patterns, and k, the sequence length;
// n pattern lines follow. For each case, prints the number of length-k
// sequences that contain no pattern, modulo `mod`.
int main()
{

    ll k;
    // Require both numbers to be parsed; a partial match ends the loop
    // instead of running a case on stale values.
    while(scanf("%lld%lld",&n,&k)==2)
    {
        ac.init();
        for(int i=1; i<=n; i++)
        {
            // Field width keeps the read inside s (104 chars + terminator).
            if(scanf("%104s",s)!=1)
                return 0; // truncated input: stop rather than reuse the old buffer
            ac.insert(s);
        }
        ac.build();
        matrix A=getMatrix();
        matrix ans=pow_mod(A,k);
        // Row 0 holds the number of k-step walks from the root to each state.
        ll sum=0;
        for(int i=0;i<ac.cnt;i++)
            sum=(sum+ans.m[0][i])%mod;
        printf("%lld\n",sum);
    }
    return 0;
}

EnjoyingAC

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【AC自动机+矩阵快速幂】DNA Sequence POJ - 2778（求不包含特定字符串的长为n的字符串的总数）

题目链接DNA Sequence POJ - 2778题目It’s well known that DNA Sequence is a sequence only contains A, C, T and G, and it’s very useful to analyze a segment of DNA Sequence，For example, if a animal’s DNA...
复制链接

扫一扫

专栏目录