C++算法 - AC自动机

最新推荐文章于 2024-07-04 11:10:43 发布

水源菌

最新推荐文章于 2024-07-04 11:10:43 发布

阅读量1.9k

点赞数 3

分类专栏： C++算法竞赛文章标签： c++ 算法广度优先

本文链接：https://blog.csdn.net/LNY2022/article/details/130655584

版权

C++算法竞赛专栏收录该内容

3 篇文章 0 订阅

订阅专栏

介绍

与KMP类似，AC自动机也是用来处理字符串匹配的问题。与KMP不同的是，KMP用来处理单模式串问题，即问模式串 $T$ 是否是主串 $S$ 的子串，而AC自动机则能处理多模式串的问题。
AC自动机处理的常见问题如：给出 $n$ 个单词 $T_i$ ，再给出一段文章 $S$ ，问有多少个单词在文章里出现了。

AC自动机的构造过程

建立模式串的字典树（Trie）
给Trie添加失配路径
根据AC自动机，搜索待处理的文本

举个例子：

Q：给定5个单词：say she shr he her，然后给定一个字符串yasherhs。问一共有多少个单词在这个字符串中出现过。

构建字典树

为了便于匹配模式串与主串，我们要先将所有模式串转换为一个字典树，如下图：

其中 $root$ 是根结点，绿色的结点表示单词的结尾，用一个结构体数组来储存，代码如下：

/**
 * One node of the pattern trie.
 *
 * next[c] is the child reached by the letter 'a' + c, or NULL when the node
 * has no such child. count is 0 on a fresh node; a non-zero value marks the
 * last node of a word.
 */
struct node {
	node *next[26]; // one child slot per letter 'a'..'z'
	int count; // word-end marker: 0 means no word ends at this node
	// A fresh node has no children and ends no word. `next()` value-initialises
	// the whole array to null pointers; this replaces a memset call that passed
	// the pointer constant NULL as memset's int fill value.
	node() : next(), count(0) {}
} *q[15]; // q: global array of 15 node pointers declared together with the type

/**
 * Add the word `str` to the trie rooted at `root`.
 *
 * Walks one node per character, creating missing children with `new`, and
 * increments `count` on the node where the word ends (so inserting the same
 * word twice leaves count == 2 there). An empty word increments root's count.
 *
 * A word containing any character outside 'a'..'z' is ignored as a whole:
 * such a character has no slot in next[26] and would index outside the array.
 * NULL `str` or `root` is also ignored.
 *
 * Nodes allocated here are not freed by this function.
 */
void insert(const char *str, node *root) {
	if (str == NULL || root == NULL)
		return;
	// Validate first so a rejected word leaves no partial path in the trie.
	for (int i = 0; str[i]; i++) {
		int index = str[i] - 'a';
		if (index < 0 || index >= 26)
			return;
	}
	node *p = root;
	for (int i = 0; str[i]; i++) {
		int index = str[i] - 'a';
		if (p -> next[index] == NULL) {
			p -> next[index] = new node();
		}
		// Descend whether the child already existed or was just created.
		p = p -> next[index];
	}
	p -> count++;
}

构建失配指针

Q：为什么要构建失配指针？
A：AC自动机在匹配时如果当前字符匹配失败，那么可以用失配指针进行跳转。跳转后的串的前缀必为跳转前的模式串的后缀。这样可以大大减少重复判断的次数，降低时间复杂度。

根据此原理，我们可以给每个结点构建一个失配指针，如果找不到能够跳转的串则指向根结点，根结点的失配指针指向 $NULL$ ，如图：

图中虚线箭头表示的就是失配指针。

从这张图中，我们可以发现： 失配指针是由其父结点的失配指针决定的，如果其父结点的失配指针指向的结点有与其值相同的子结点，那么它的失配指针就指向这个子结点；否则继续沿着失配指针向上查找，直到找到这样的子结点为止，如果一直查到根结点的失配指针（ $NULL$ ）仍未找到，则它的失配指针指向根结点。 因为字典树中兄弟结点是不会重复的，所以我们同样可以确定： 每个结点的失配指针是确定的（有且仅有一个）。 同时，因为每个结点的失配指针是由其父结点决定的，所以一个结点失配指针指向的结点深度一定小于这个结点的深度，又因为一个结点的失配指针指向的结点不一定在从根结点到这个点的路径上，所以我们需要从根结点开始，一个层次，一个层次地构建失配指针。于是，我们联想到了广度优先搜索（BFS）算法。

首先，我们需要给原来的结构体 $node$ 添加一个属性 $fail$ ：

/**
 * One node of the Aho-Corasick trie.
 *
 * fail is the failure link followed when the next text character has no
 * matching child; it is NULL on a fresh node. next[c] is the child for the
 * letter 'a' + c, or NULL. count starts at 0.
 */
struct node {
	node *fail; // failure link; NULL until one is assigned
	node *next[26]; // one child slot per letter 'a'..'z'
	int count; // 0 on a fresh node
	// Every member gets a defined value: the failure link used to be left
	// uninitialised, and `next()` replaces a memset call that passed the
	// pointer constant NULL as memset's int fill value.
	node() : fail(NULL), next(), count(0) {}
} *q[15]; // q: global array of 15 node pointers declared together with the type

然后我们用广度优先搜索实现构建失配指针，代码如下：

// Left over from the fixed-array queue; kept so that any other code naming
// them still compiles. buildAcAutomation below does not read or write them.
int head, tail;
/**
 * Assign the failure link (`fail`) of every node in the trie rooted at `root`.
 *
 * Nodes are visited breadth-first. root's link is NULL and every child of
 * root links back to root. For any other node reached from parent P by
 * letter c, the link is found by following P's failure chain until a node
 * with a child for c appears; if the chain runs out (reaches NULL), the link
 * is root.
 *
 * The visiting order is kept in a local growable array with a read cursor,
 * so the number of trie nodes is not limited by a fixed-size queue and the
 * function can be called more than once. A NULL `root` is ignored.
 */
void buildAcAutomation(node *root) {
	if (root == NULL)
		return;
	root -> fail = NULL;
	// `order` holds the nodes in breadth-first order; `at` is the read cursor.
	std::vector<node *> order;
	order.push_back(root);
	for (std::size_t at = 0; at < order.size(); at++) {
		node *cur = order[at];
		for (int c = 0; c < 26; c++) {
			node *child = cur -> next[c];
			if (child == NULL)
				continue;
			// Follow the parent's failure chain to the first node that has a
			// child for the same letter. For cur == root the chain is empty
			// (root -> fail is NULL), so the child links straight to root.
			node *p = cur -> fail;
			while (p != NULL && p -> next[c] == NULL)
				p = p -> fail;
			child -> fail = (p == NULL) ? root : p -> next[c];
			order.push_back(child);
		}
	}
}

此处的广度优先搜索是使用指针变量的方式实现的，同样可以用队列的方式来实现，可参考C++一本通基础算法：广度优先搜索（BFS）。

匹配

最后，我们便可以在AC自动机上查找主串中出现过哪些单词了。匹配过程分两种情况：

当前字符匹配，表示从当前结点沿着树边有一条路径可以到达目标字符，此时只需沿该路径走向下一个结点继续匹配即可，目标字符串指针移向下个字符继续匹配；
当前字符不匹配，则去当前结点失配指针所指向的字符继续匹配，匹配过程随着指针指向 $root$ 结束。

重复这两个过程中的任意一个，直到主串走到结尾为止。

匹配的代码如下：

char str[15]; // text to search; NUL-terminated, at most 14 characters
/**
 * Count the inserted words that occur in the global text `str`.
 *
 * The text is scanned once. At each position the automaton moves to the
 * child for the current letter, following failure links while the current
 * node has no such child and is not root. From the node reached, the failure
 * chain is walked up to root and every node's `count` is added to the result.
 *
 * Each visited node's `count` is then overwritten with -1, so a word is
 * counted for its first occurrence only. The trie is modified as a side
 * effect: a second call will not count the same words again.
 *
 * A text character outside 'a'..'z' has no slot in next[26]; it cannot be
 * part of any inserted word, so matching restarts from root after it.
 *
 * Requires failure links to have been assigned for the whole trie first.
 * Returns 0 when `root` is NULL.
 */
int query(node *root) {
	if (root == NULL)
		return 0;
	int cnt = 0;
	node *p = root;
	for (int i = 0; str[i]; i++) {
		int index = str[i] - 'a';
		if (index < 0 || index >= 26) {
			p = root;
			continue;
		}
		// Fall back along failure links until this letter can be consumed
		// or we are back at root.
		while (p -> next[index] == NULL && p != root)
			p = p -> fail;
		p = p -> next[index];
		if (p == NULL)
			p = root;
		// Collect every word that ends at this text position. A count of -1
		// means this node and the rest of its chain were collected earlier.
		for (node *temp = p; temp != root && temp -> count != -1; temp = temp -> fail) {
			cnt += temp -> count;
			temp -> count = -1;
		}
	}
	return cnt;
}

其中 $cnt$ 的值就是这个字符串中出现过的单词个数。

完整代码

#include <bits/stdc++.h>
using namespace std;
/**
 * One node of the Aho-Corasick trie.
 *
 * fail is the failure link followed when the next text character has no
 * matching child; it is NULL on a fresh node. next[c] is the child for the
 * letter 'a' + c, or NULL. count starts at 0.
 */
struct node {
	node *fail; // failure link; NULL until one is assigned
	node *next[26]; // one child slot per letter 'a'..'z'
	int count; // 0 on a fresh node
	// Every member gets a defined value: the failure link used to be left
	// uninitialised, and `next()` replaces a memset call that passed the
	// pointer constant NULL as memset's int fill value.
	node() : fail(NULL), next(), count(0) {}
} *q[15]; // q: global array of 15 node pointers declared together with the type
// Buffer for one pattern word (15 bytes including the terminating NUL).
char t[15];
// Buffer for the text that is searched (15 bytes including the terminating NUL).
char str[15];
// Integer cursors for array-based queue bookkeeping; both are zero at start-up.
int head;
int tail;
/**
 * Add the word `str` to the trie rooted at `root`.
 *
 * Walks one node per character, creating missing children with `new`, and
 * increments `count` on the node where the word ends (so inserting the same
 * word twice leaves count == 2 there). An empty word increments root's count.
 *
 * A word containing any character outside 'a'..'z' is ignored as a whole:
 * such a character has no slot in next[26] and would index outside the array.
 * NULL `str` or `root` is also ignored.
 *
 * Nodes allocated here are not freed by this function.
 */
void insert(const char *str, node *root) {
	if (str == NULL || root == NULL)
		return;
	// Validate first so a rejected word leaves no partial path in the trie.
	for (int i = 0; str[i]; i++) {
		int index = str[i] - 'a';
		if (index < 0 || index >= 26)
			return;
	}
	node *p = root;
	for (int i = 0; str[i]; i++) {
		int index = str[i] - 'a';
		if (p -> next[index] == NULL) {
			p -> next[index] = new node();
		}
		// Descend whether the child already existed or was just created.
		p = p -> next[index];
	}
	p -> count++;
}
/**
 * Assign the failure link (`fail`) of every node in the trie rooted at `root`.
 *
 * Nodes are visited breadth-first. root's link is NULL and every child of
 * root links back to root. For any other node reached from parent P by
 * letter c, the link is found by following P's failure chain until a node
 * with a child for c appears; if the chain runs out (reaches NULL), the link
 * is root.
 *
 * The visiting order is kept in a local growable array with a read cursor,
 * so the number of trie nodes is not limited by a fixed-size queue and the
 * function can be called more than once. A NULL `root` is ignored.
 */
void buildAcAutomation(node *root) {
	if (root == NULL)
		return;
	root -> fail = NULL;
	// `order` holds the nodes in breadth-first order; `at` is the read cursor.
	std::vector<node *> order;
	order.push_back(root);
	for (std::size_t at = 0; at < order.size(); at++) {
		node *cur = order[at];
		for (int c = 0; c < 26; c++) {
			node *child = cur -> next[c];
			if (child == NULL)
				continue;
			// Follow the parent's failure chain to the first node that has a
			// child for the same letter. For cur == root the chain is empty
			// (root -> fail is NULL), so the child links straight to root.
			node *p = cur -> fail;
			while (p != NULL && p -> next[c] == NULL)
				p = p -> fail;
			child -> fail = (p == NULL) ? root : p -> next[c];
			order.push_back(child);
		}
	}
}
/**
 * Count the inserted words that occur in the global text `str`.
 *
 * The text is scanned once. At each position the automaton moves to the
 * child for the current letter, following failure links while the current
 * node has no such child and is not root. From the node reached, the failure
 * chain is walked up to root and every node's `count` is added to the result.
 *
 * Each visited node's `count` is then overwritten with -1, so a word is
 * counted for its first occurrence only. The trie is modified as a side
 * effect: a second call will not count the same words again.
 *
 * A text character outside 'a'..'z' has no slot in next[26]; it cannot be
 * part of any inserted word, so matching restarts from root after it.
 *
 * Requires failure links to have been assigned for the whole trie first.
 * Returns 0 when `root` is NULL.
 */
int query(node *root) {
	if (root == NULL)
		return 0;
	int cnt = 0;
	node *p = root;
	for (int i = 0; str[i]; i++) {
		int index = str[i] - 'a';
		if (index < 0 || index >= 26) {
			p = root;
			continue;
		}
		// Fall back along failure links until this letter can be consumed
		// or we are back at root.
		while (p -> next[index] == NULL && p != root)
			p = p -> fail;
		p = p -> next[index];
		if (p == NULL)
			p = root;
		// Collect every word that ends at this text position. A count of -1
		// means this node and the rest of its chain were collected earlier.
		for (node *temp = p; temp != root && temp -> count != -1; temp = temp -> fail) {
			cnt += temp -> count;
			temp -> count = -1;
		}
	}
	return cnt;
}
/**
 * Read a word count n, then n words, then one text from standard input;
 * build the automaton and print how many of the words occur in the text.
 *
 * Every string read is limited to 14 characters because `t` and `str` are
 * 15-byte buffers; longer input tokens are split rather than overflowing.
 * Returns 1 without printing anything if the count, a word, or the text
 * cannot be read.
 */
int main() {
	int n;
	if (scanf("%d", &n) != 1)
		return 1;
	node *root = new node();
	for (int i = 1; i <= n; i++) {
		// "%14s" leaves room for the terminating NUL in the 15-byte buffer;
		// the array itself (not its address) is the char * that %s expects.
		if (scanf("%14s", t) != 1)
			return 1;
		insert(t, root);
	}
	if (scanf("%14s", str) != 1)
		return 1;
	buildAcAutomation(root);
	printf("%d\n", query(root));
	return 0;
}