【AC自动机 && 哪些单词在文本中出现（单词可能重复，重复算多个）】HDU - 2222 Keywords Search

最新推荐文章于 2019-04-12 10:45:33 发布

笑对这个世界的志贵

最新推荐文章于 2019-04-12 10:45:33 发布

阅读量221

点赞数

分类专栏： AC自动机

AC自动机专栏收录该内容

3 篇文章 0 订阅

订阅专栏

Step1 Problem:

给出若干单词和一段文本，统计这些单词中有多少个在文本中出现过（给出的单词可能重复，重复的单词算多个）。
数据范围：
单词个数 <= 10000, 单词长度 <= 50, 文本长度 <= 1e6.

Step2 Ideas:

AC 自动机和 KMP 很相似：KMP 有 next 数组，AC 自动机有 fail 指针。
两者含义相同：设当前点对应的字符串为 S，失配时跳到“S 的最长真后缀，且该后缀同时是字典树中某个前缀”所对应的位置。
存储多个单词用到的结构是字典树；因为一个点的 fail 指针要由它父亲的 fail 指针推出，所以用 BFS 按层求解，保证处理某个点时它父亲的 fail 已经求好。
匹配的过程中：单词在文本中出现过一次后，要把对应节点标记一下（代码中把 data 置为 -1），因为每个单词只统计一次，之后再次出现时不需要重复计数。

Step3 Code:

#include<bits/stdc++.h>
using namespace std;
// Size of the text buffer: the maximum text length (1e6) plus a little slack.
const int N = 1000000 + 5;
// Size of the trie node pool: 10000 words * 50 letters, plus a little slack.
const int M = 500000 + 5;
// One node of the trie that backs the Aho-Corasick automaton.
struct node
{
    // Number of inserted words that end at this node; the matcher
    // overwrites it with -1 once the node has been counted.
    int data;
    // Children, indexed by (letter - 'a'); NULL where no child exists.
    node *next[26];
    // Failure link; NULL until the failure links are built.
    node *fail;
};
// Node pool: trie nodes are handed out from this array rather than heap-allocated.
node a[M];
// Buffer that receives the text to be searched.
char s1[N];
// Index of the next unused entry of the pool `a`; reset to 0 for every test case.
int top;
// Takes the next unused node from the pool `a`, resets it to the empty state
// (no children, no failure link, zero words) and returns it.
// Advances `top`; no check is made that the pool still has room.
node *creat_kong()
{
    node *fresh = a + top;
    ++top;
    fresh->data = 0;
    fresh->fail = NULL;
    int slot = 26;
    while(slot--)
        fresh->next[slot] = NULL;
    return fresh;
}
// Adds the word `s` to the trie rooted at `root`, creating any missing nodes
// along the path, and increments the word counter of the node where it ends.
// Each character selects the child slot (character - 'a').
void Insert(node *root, char s[])
{
    node *cur = root;
    for(const char *ch = s; *ch != '\0'; ++ch)
    {
        node *&child = cur->next[*ch - 'a'];
        if(child == NULL) child = creat_kong();
        cur = child;
    }
    cur->data++;
}
// Builds the failure link of every non-root node by a breadth-first walk from
// `root`, so a parent's link is always set before its children are handled.
// Relies on root->fail being NULL, which terminates the upward search.
void get_fail(node *root)
{
    std::queue<node*> pending;
    pending.push(root);
    while(!pending.empty())
    {
        node *parent = pending.front();
        pending.pop();
        for(int c = 0; c < 26; c++)
        {
            node *child = parent->next[c];
            if(child == NULL) continue;
            // Follow the parent's failure chain until a node with a child on
            // the same letter is found, or the chain runs out (NULL).
            node *cand = parent->fail;
            while(cand != NULL && cand->next[c] == NULL)
                cand = cand->fail;
            // Children of the root land here with cand == NULL, so they link
            // back to the root like every other node without a candidate.
            child->fail = (cand != NULL) ? cand->next[c] : root;
            pending.push(child);
        }
    }
}
// Runs the text `s` through the automaton rooted at `root` and returns the sum
// of the word counters of all nodes reached, counting each node only once.
// Counted nodes get data = -1, so the trie is modified by this call.
int mat(node *root, char s[])
{
    int found = 0;
    node *state = root;
    const int n = strlen(s);
    for(int pos = 0; pos < n; pos++)
    {
        const int c = s[pos] - 'a';
        // Fall back along failure links until the letter can be consumed
        // or the root is reached.
        while(state != root && !state->next[c]) state = state->fail;
        // Only possible at the root: no transition at all, skip this letter.
        if(state->next[c] == NULL) continue;
        state = state->next[c];
        // Collect every word ending here or at a suffix of the current match;
        // stop at the root or at a node that was already counted.
        for(node *hit = state; hit != root && hit->data != -1; hit = hit->fail)
        {
            found += hit->data;
            hit->data = -1;
        }
    }
    return found;
}
// Reads T test cases. Each case consists of a word count n, n words and one
// text; prints how many of the words occur in the text (duplicate words are
// counted separately). Stops quietly on malformed or truncated input.
int main()
{
    int T, n;
    // Without this check T would be used uninitialised when input is missing.
    if(scanf("%d", &T) != 1) return 0;
    char s2[100];
    // "> 0" keeps a negative T from looping until the counter wraps around.
    while(T-- > 0)
    {
        if(scanf("%d", &n) != 1) return 0;
        top = 0;  // reuse the node pool from the start for this test case
        node *root = creat_kong();
        for(int i = 0; i < n; i++)
        {
            // Field width 99 = sizeof(s2) - 1, so a long token cannot overflow s2.
            if(scanf("%99s", s2) != 1) return 0;
            Insert(root, s2);
        }
        get_fail(root);
        // Field width 1000004 = N - 1 (size of s1 minus the terminator);
        // keep it in sync with N if that constant changes.
        if(scanf("%1000004s", s1) != 1) return 0;
        printf("%d\n", mat(root, s1));
    }
    return 0;
}