AC自动机学习

最新推荐文章于 2019-08-02 17:39:08 发布

Lanifer

最新推荐文章于 2019-08-02 17:39:08 发布

阅读量691

点赞数

分类专栏： AC自动机 UVA 文章标签： AC自动机

本文链接：https://blog.csdn.net/lg_csu/article/details/23222629

版权

UVA 同时被 2 个专栏收录

22 篇文章 0 订阅

订阅专栏

AC自动机

1 篇文章 0 订阅

订阅专栏

1. LA 4670 Dominating Patterns

题意：在一个文本串中找出出现次数最多的模式串. 数据范围： text_len <= 1000000 , word_len <= 70 , word_cnt <= 150

解法： Aho-corasick （AC自动机）

说明：与KMP算法类似，都是通过构造一个失配转移自动机。不同的是KMP直接在数组上操作，而AC自动机是在字典树上操作。与KMP算法的比较及一些细节参见代码中说明。

白书模板：

#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <queue>
using namespace std;
// Capacity limits: the bounds given in the problem statement plus a little slack.
const int text_len = 1000050 ;        // longest text
const int word_len = 75 ;             // longest pattern
const int word_num = 155 ;            // number of patterns
const int sigma_size = 26 ;           // alphabet size (lowercase letters, see idx())
const int max_node = word_num * 80 ;  // capacity of the trie node arrays

char word[word_num][word_len] ;  // the patterns; main() stores them in rows 1..n
char text[text_len] ;            // the text to be searched
int cnt[word_num] ;              // cnt[id] = number of occurrences of pattern id

int son[max_node][sigma_size] ;  // trie children; GetFail() fills empty slots with fallback transitions
int flag[max_node] ;             // id of the pattern ending at a node, 0 when none does
int node_cnt ;                   // index of the most recently allocated node (node 0 is the root)
int f[max_node] ;                // failure link of each node
int last[max_node] ;             // nearest flagged node on the failure chain; shortens the mismatch walk

void init(){
    node_cnt = 0;
    flag[0] = 0 ;
    memset(son[0] ,0 ,sizeof(son[0]));
}

// Converts a letter to its 0-based offset from 'a'.
inline int idx(char c) {
    const int offset = c - 'a' ;
    return offset ;
}

// Trie insertion: adds pattern s and marks its final node with pattern number id.
// id should be non-zero, because a flag of 0 is read as "no pattern ends here".
// Inserting the same string twice leaves only the id of the later call.
void Insert(char *s , int id){
    int cur = 0 ;
    for(const char *p = s ; *p != '\0' ; ++p){
        const int c = idx(*p) ;
        if(son[cur][c] == 0){
            // No child for this letter yet: allocate a fresh, empty node.
            const int fresh = ++node_cnt ;
            flag[fresh] = 0 ;
            memset(son[fresh] , 0 , sizeof(son[fresh])) ;
            son[cur][c] = fresh ;
        }
        cur = son[cur][c] ;
    }
    flag[cur] = id ;
}

// Computes the failure function with a breadth-first pass over the trie, i.e.
// builds the state-transition graph. This is the heart of the algorithm.
// Every missing child slot is filled with the transition reached through the
// failure link, so matching never needs an explicit fallback loop.
void GetFail(){
    queue<int> bfs ;
    f[0] = 0 ;
    last[0] = 0 ;
    // Depth-1 nodes fall back to the root (the analogue of f[1] = 0 in KMP).
    for(int c = 0 ; c < sigma_size ; ++c){
        const int child = son[0][c] ;
        if(child == 0) continue ;
        f[child] = 0 ;
        last[child] = 0 ;
        bfs.push(child) ;
    }
    while(!bfs.empty()){
        const int cur = bfs.front() ;
        bfs.pop() ;
        for(int c = 0 ; c < sigma_size ; ++c){
            const int child = son[cur][c] ;
            if(child != 0){
                bfs.push(child) ;
                // Derive f[child] from f[cur], like computing f[i+1] from f[i] in KMP.
                int fall = f[cur] ;
                while(fall != 0 && son[fall][c] == 0) fall = f[fall] ;
                const int target = son[fall][c] ;
                f[child] = target ;
                last[child] = flag[target] ? target : last[target] ;
            } else {
                // Missing edge: jump straight to where the failure link would lead.
                son[cur][c] = son[f[cur]][c] ;
            }
        }
    }
}

// Search: feeds text s through the automaton and counts in cnt[] how often each
// pattern id occurs. cnt is cleared first. Relies on the transitions filled in
// by GetFail().
void Find(char *s){
    memset(cnt , 0 , sizeof(cnt)) ;
    int state = 0 ;
    for(const char *p = s ; *p != '\0' ; ++p){
        // son already contains the fallback transitions, so a single lookup
        // replaces the loop "while(state && !son[state][c]) state = f[state]".
        state = son[state][idx(*p)] ;
        // Even if this node carries no flag, some other pattern may end here;
        // the last-chain visits every such node.
        for(int hit = state ; hit != 0 ; hit = last[hit]){
            if(flag[hit] != 0) ++cnt[flag[hit]] ;
        }
    }
}

int main()
{
    int n ;
    while(scanf("%d" , &n)==1 && n){
        init() ;
        for(int i=1; i<=n; i++) scanf("%s" , word[i]) , Insert(word[i] , i) ; // 由于本题数据较弱，没有相同的模式串 ，故直接插入。 否则需要建立字符串到整数的映射。
        GetFail() ;
        scanf("%s" , text) ;
        Find(text) ;
        int M = 0 ;
        for(int i=1; i<=n; i++) M = max(M , cnt[i]) ;
        printf("%d\n" , M) ;
        for(int i=1; i<=n ;i++)
            if(cnt[i] == M) printf("%s\n" , word[i]) ;
    }
    return 0;
}

2. UVA - 11468 Substring

需要改造AC自动机，将不能往下走的点接回失配点。并在自动机中标记所有的单词节点，注意存在下面这种情况：

2个单词： he , sheil

沿着 s -> h -> e 走到这个节点时，同样是单词节点。

然后可以动态规划解决： dp[i , j] 当前在节点i ，还需要走j 步 , 转移是不经过单词节点，一步步往前走。

dp[ i , j ] = sum { P(第x个字符的概率) * dp[ i 的第x个儿子 ] [ j-1] }

代码：

#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <queue>
using namespace std;
const double eps = 1e-6 ;     // tolerance constant; not used anywhere in this program
const int maxn = 25 ;         // sizes the pattern buffer and, times 20, the node arrays
const int sigma_size = 62 ;   // alphabet size: a-z, A-Z and 0-9 (see idx())

char word[maxn][maxn] ;         // patterns as read from the input
double prob[sigma_size] ;       // prob[c] = probability given for character c
bool notzero[sigma_size] ;      // notzero[c] = character c was listed in the probability table
int son[maxn*20][sigma_size] ;  // automaton transitions; GetFail() fills the empty slots
int f[maxn*20] ;                // failure link of each node
int is_word[maxn*20] ;          // non-zero if a pattern ends at the node or at a node on its failure chain
int tot ;                       // index of the most recently allocated node (node 0 is the root)
double dp[maxn*20][110] ;       // memo table for DP(u , L)
bool vis[maxn*20][110] ;        // vis[u][L] = dp[u][L] has already been computed

int new_node(){
    ++tot ;
    is_word[tot] = 0 ;
    memset(son[tot] ,0 ,sizeof(son[0]));
    return tot ;
}
// Maps a character to its slot in the 62-symbol alphabet:
// 'a'..'z' -> 0..25, 'A'..'Z' -> 26..51, and any other character is treated
// as a digit, giving 52 + (c - '0').
inline int idx(char c){
    if(c >= 'a' && c <= 'z') return c - 'a' ;
    if(c >= 'A' && c <= 'Z') return 26 + (c - 'A') ;
    return 52 + (c - '0') ;
}
// Resets the per-test-case state: the trie goes back to a bare root and the
// probability table is cleared.
void init(){
    memset(son[0] , 0 , sizeof(son[0])) ;
    is_word[0] = 0 ;
    f[0] = 0 ;
    tot = 0 ;
    memset(prob , 0 , sizeof(prob)) ;
    memset(notzero , 0 , sizeof(notzero)) ;
}
// Inserts pattern s into the trie, creating nodes as needed, and marks the
// node where it ends as a word node.
void Insert(char *s) {
    int cur = 0 ;
    for(const char *p = s ; *p != '\0' ; ++p){
        const int c = idx(*p) ;
        if(son[cur][c] == 0) son[cur][c] = new_node() ;
        cur = son[cur][c] ;
    }
    is_word[cur] = 1 ;
}
// Builds the failure links breadth-first and fills every missing child slot
// with the transition reached through the failure link. The word mark is also
// propagated along failure links, so a node is marked whenever some pattern is
// a suffix of the string leading to it.
void GetFail(){
    queue<int> bfs ;
    f[0] = 0 ;
    // Depth-1 nodes fall back to the root.
    for(int c = 0 ; c < sigma_size ; ++c){
        const int child = son[0][c] ;
        if(child == 0) continue ;
        bfs.push(child) ;
        f[child] = 0 ;
    }
    while(!bfs.empty()){
        const int cur = bfs.front() ;
        bfs.pop() ;
        for(int c = 0 ; c < sigma_size ; ++c){
            const int child = son[cur][c] ;
            if(child == 0){
                // Missing edge: reuse the transition of the failure node.
                son[cur][c] = son[f[cur]][c] ;
                continue ;
            }
            bfs.push(child) ;
            int fall = f[cur] ;
            while(fall != 0 && son[fall][c] == 0) fall = f[fall] ;
            f[child] = son[fall][c] ;
            // Inherit the mark from the failure target.
            is_word[child] = is_word[child] | is_word[f[child]] ;
        }
    }
}

// Memoised recursion over (automaton state u, remaining steps L). The value is
// 1 when no steps remain; otherwise it is the probability-weighted sum of the
// values of the successor states, skipping characters that are not in the
// probability table and successors that are word nodes.
double DP(int u , int L){
    if(vis[u][L]) return dp[u][L] ;
    if(L == 0){
        dp[u][L] = 1.0 ;
        return dp[u][L] ;
    }
    vis[u][L] = 1 ;
    // Accumulate directly in the memo cell.
    double &acc = dp[u][L] ;
    acc = 0.0 ;
    for(int c = 0 ; c < sigma_size ; ++c){
        if(!notzero[c]) continue ;
        const int next = son[u][c] ;
        if(is_word[next]) continue ;
        acc += prob[c] * DP(next , L-1) ;
    }
    return acc ;
}

// Reads T test cases. Each case provides the patterns, a table of characters
// with their probabilities, and a length L; the result of DP(0 , L) is printed
// in the "Case #k: value" format.
int main()
{
    int T ;
    scanf("%d" ,&T) ;
    for(int cas = 1 ; T-- ; ++cas){
        int n ;
        init() ;

        // Patterns.
        scanf("%d" ,&n) ;
        for(int i = 0 ; i < n ; ++i){
            scanf("%s" , word[i]) ;
            Insert(word[i]) ;
        }
        GetFail() ;

        // Character / probability table.
        char ch[5] ;
        double pt ;
        scanf("%d" ,&n) ;
        for(int i = 0 ; i < n ; ++i){
            scanf("%s%lf" , ch , &pt) ;
            const int c = idx(ch[0]) ;
            notzero[c] = 1 ;
            prob[c] = pt ;
        }

        // The memo table depends on the automaton, so start it afresh.
        memset(vis , 0 , sizeof(vis)) ;
        int L ;
        scanf("%d" ,&L) ;
        const double ans = DP(0 , L) ;
        printf("Case #%d: %f\n" , cas , ans) ;
    }
    return 0 ;
}

3. UVA - 11019 Matrix Matcher

二维的模式匹配：求一个大的字符矩阵T内出现了多少次小的字符矩阵P。

方法：将二维模式匹配转换为一维的模式匹配，然后就能利用一维的AC自动机等方法解决。

将模式矩阵P的每行当成一个字符串构建自动机。
依次匹配T的每行，用cnt[ i ][ j ] 记录以i，j为左上顶点且与 P大小相同的矩形框内能匹配到P的多少行。

显然，当 cnt[ i ][ j ] 等于 P 的行数时，以 (i , j) 为左上顶点的位置就是一个二维匹配。

注意： P可以出现相同的两行。

有大神居然hash解决了。。。

#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <queue>
using namespace std;
const int maxn = 1010 ;       // side length of the text and counter matrices
const int sigma_size = 26 ;   // alphabet size (lowercase letters, see idx())

int N ;   // number of rows of the text matrix
int M ;   // number of columns of the text matrix
int X ;   // number of rows of the pattern matrix
int Y ;   // number of columns of the pattern matrix

char text[maxn][maxn] ;    // the text matrix, one row per string
char word[110][110] ;      // the pattern matrix, one row per string
int son[100*100][100] ;    // automaton transitions; GetFail() fills the empty slots
int f[100*100] ;           // failure link of each node
int num[100*100] ;         // number of pattern rows ending at each node
int row[100*100][100] ;    // row[u][k] = index of the k-th pattern row ending at node u
int last[100*100] ;        // nearest node on the failure chain where some pattern row ends
int tot ;                  // index of the most recently allocated node (node 0 is the root)
int cnt[maxn][maxn] ;      // cnt[i][j] = pattern rows matched for the window whose top-left corner is (i , j)

// Converts a letter to its 0-based offset from 'a'.
inline int idx(char c) {
    const int offset = c - 'a' ;
    return offset ;
}
// Resets the automaton to a bare root: no children and no pattern rows at node 0.
void ac_init(){
    memset(son[0] , 0 , sizeof(son[0])) ;
    num[0] = 0 ;
    tot = 0 ;
}
// Allocates the next trie node with no children and an empty row list, and
// returns its index.
int new_node(){
    const int id = ++tot ;
    num[id] = 0 ;
    memset(son[id] , 0 , sizeof(son[id])) ;
    return id ;
}
// Inserts the first n characters of s into the trie and appends id to the list
// of pattern rows ending at the final node, so identical rows share one node.
void Insert(char *s , int n , int id) {
    int cur = 0 ;
    for(int k = 0 ; k < n ; ++k){
        const int c = idx(s[k]) ;
        if(son[cur][c] == 0) son[cur][c] = new_node() ;
        cur = son[cur][c] ;
    }
    row[cur][num[cur]] = id ;
    ++num[cur] ;
}
// Builds the failure links breadth-first, fills every missing child slot with
// the transition reached through the failure link, and sets last[] to the
// nearest node on the failure chain at which a pattern row ends.
void GetFail(){
    queue<int> bfs ;
    f[0] = 0 ;
    last[0] = 0 ;
    // Depth-1 nodes fall back to the root.
    for(int c = 0 ; c < sigma_size ; ++c){
        const int child = son[0][c] ;
        if(child == 0) continue ;
        bfs.push(child) ;
        f[child] = 0 ;
        last[child] = 0 ;
    }
    while(!bfs.empty()){
        const int cur = bfs.front() ;
        bfs.pop() ;
        for(int c = 0 ; c < sigma_size ; ++c){
            const int child = son[cur][c] ;
            if(child != 0){
                bfs.push(child) ;
                int fall = f[cur] ;
                while(fall != 0 && son[fall][c] == 0) fall = f[fall] ;
                const int target = son[fall][c] ;
                f[child] = target ;
                last[child] = num[target] ? target : last[target] ;
            } else {
                // Missing edge: reuse the transition of the failure node.
                son[cur][c] = son[f[cur]][c] ;
            }
        }
    }
}
// Runs the first n characters of text row r (string s) through the automaton.
// Whenever pattern row pr ends at column col, the window whose top-left corner
// is (r - pr , col - Y + 1) has one more of its rows confirmed, so its counter
// is incremented. Hits with pr > r are skipped because the window would start
// above the text.
void Match(char *s , int n ,int r){
    int state = 0 ;
    for(int col = 0 ; col < n ; ++col){
        state = son[state][idx(s[col])] ;
        // Visit this node and every node on its last-chain.
        for(int p = state ; p != 0 ; p = last[p]){
            for(int k = 0 ; k < num[p] ; ++k){
                const int pr = row[p][k] ;
                if(pr <= r) ++cnt[r - pr][col - Y + 1] ;
            }
        }
    }
}

// Reads T test cases. Each case gives an N x M text matrix and an X x Y
// pattern matrix; the program prints how many windows of the text have all X
// pattern rows matched.
int main()
{
    int T ;
    scanf("%d" ,&T) ;
    while(T--){
        scanf("%d%d" ,&N ,&M) ;
        ac_init() ;
        for(int r = 0 ; r < N ; ++r) scanf("%s" , text[r]) ;

        // Every pattern row becomes one word of the automaton.
        scanf("%d%d" ,&X ,&Y) ;
        for(int r = 0 ; r < X ; ++r){
            scanf("%s" , word[r]) ;
            Insert(word[r] , Y , r) ;
        }
        GetFail() ;

        memset(cnt , 0 ,sizeof(cnt)) ;
        for(int r = 0 ; r < N ; ++r) Match(text[r] , M , r) ;

        // A window matches when all X of its rows were counted.
        const int row_limit = N - X + 1 ;
        const int col_limit = M - Y + 1 ;
        int sum = 0 ;
        for(int i = 0 ; i < row_limit ; ++i){
            for(int j = 0 ; j < col_limit ; ++j){
                if(cnt[i][j] == X) ++sum ;
            }
        }
        printf("%d\n" , sum) ;
    }
    return 0 ;
}