1 介绍
trie树:LeetCode-Trie树_hclbeloved的博客-CSDN博客
AC 自动机算法,全称是 Aho-Corasick 算法。其实,Trie 树跟 AC 自动机之间的关系,就像单模式串匹配中朴素的串匹配算法跟 KMP 算法之间的关系一样,只不过 Trie 树和 AC 自动机针对的是多模式串匹配而已。所以,AC 自动机实际上就是在 Trie 树之上,加了类似 KMP 的 next 数组(即失败指针),只不过此处的 next 数组是构建在树上的。
2 实现
// Trie4AhoCorasick.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//
#include <iostream>
#include <string.h>
#include <queue>
using namespace std;
// Aho-Corasick automaton built on top of a trie, for multi-pattern matching.
//
// Supported alphabet: the lowercase letters 'a'..'z' only. Patterns that
// contain any other character are rejected by insert(); in the text passed to
// match() such characters simply act as separators that no pattern can span.
//
// Typical use: insert() every pattern, optionally call buildFailurePointer(),
// then call match() on each text. match() rebuilds the failure links on its
// own whenever patterns were inserted since the last build.
//
// The automaton owns all of its nodes and is therefore non-copyable.
class Trie4AhoCorasick
{
private:
    // Number of child slots per node: one per letter 'a'..'z'.
    enum { kAlphabetSize = 26 };

    // A single trie node, extended with the Aho-Corasick failure link.
    class AcNode
    {
    public:
        AcNode(char value = '*') : data(value), isEndingChar(false), length(0), fail(NULL)
        {
            for (int i = 0; i < kAlphabetSize; ++i)
            {
                children[i] = NULL;
            }
        }

    public:
        char data;                       // character on the edge leading to this node
        bool isEndingChar;               // true when a stored pattern ends at this node
        int length;                      // pattern length; meaningful when isEndingChar is true
        AcNode *fail;                    // failure link (NULL for the root)
        AcNode *children[kAlphabetSize]; // child for each letter, NULL when absent
    };

public:
    // Creates an empty automaton; the root stores a meaningless placeholder character.
    Trie4AhoCorasick() : root(new AcNode('/')), failLinksStale(true)
    {
    }

    // Releases every node of the trie.
    ~Trie4AhoCorasick()
    {
        destroyTrieNode(root);
    }

    // Inserts a pattern into the trie.
    // A NULL pointer, or a pattern containing a character outside 'a'..'z',
    // is ignored entirely (nothing is partially inserted).
    void insert(const char *text)
    {
        if (text == NULL)
        {
            return;
        }
        // Validate up front so that a bad pattern leaves no stray nodes behind.
        for (const char *c = text; *c != '\0'; ++c)
        {
            if (indexOf(*c) < 0)
            {
                return;
            }
        }

        AcNode *p = root;
        int len = 0;
        for (const char *c = text; *c != '\0'; ++c, ++len)
        {
            const int index = indexOf(*c);
            if (p->children[index] == NULL)
            {
                p->children[index] = new AcNode(*c);
                failLinksStale = true; // a new node has no failure link yet
            }
            p = p->children[index];
        }
        p->isEndingChar = true;
        p->length = len;
    }

    // Returns true when `pattern` was inserted (and not deleted since);
    // a mere prefix of a stored pattern does not count.
    bool find(const char *pattern) const
    {
        const AcNode *node = locate(pattern);
        return node != NULL && node->isEndingChar;
    }

    // Removes `pattern` by clearing its end-of-pattern mark. Nodes are kept,
    // so the failure links stay valid. Unknown patterns are ignored.
    void del(const char *pattern)
    {
        AcNode *node = locate(pattern);
        if (node != NULL)
        {
            node->isEndingChar = false;
        }
    }

    // Computes the failure link of every node with a breadth-first traversal,
    // so that a parent's link is always known before its children are handled.
    void buildFailurePointer()
    {
        std::queue<AcNode *> pending;
        root->fail = NULL;
        pending.push(root);
        while (!pending.empty())
        {
            AcNode *p = pending.front();
            pending.pop();
            for (int i = 0; i < kAlphabetSize; ++i)
            {
                AcNode *pc = p->children[i];
                if (pc == NULL)
                {
                    continue;
                }
                // Follow the parent's failure chain until some node has a
                // child for the same letter; fall back to the root otherwise.
                pc->fail = root;
                for (AcNode *q = p->fail; q != NULL; q = q->fail)
                {
                    if (q->children[i] != NULL)
                    {
                        pc->fail = q->children[i];
                        break;
                    }
                }
                pending.push(pc);
            }
        }
        failLinksStale = false;
    }

    // Multi-pattern matching: scans `text` (the main string) once and prints,
    // for every occurrence of a stored pattern, its start index, its length
    // and the matched characters to std::cout. A NULL text is ignored.
    void match(const char *text)
    {
        if (text == NULL)
        {
            return;
        }
        if (failLinksStale)
        {
            // Patterns were added since the last build; without this the
            // failure links of the new nodes would still be NULL.
            buildFailurePointer();
        }

        const int n = static_cast<int>(strlen(text));
        AcNode *p = root;
        for (int i = 0; i < n; ++i)
        {
            const int idx = indexOf(text[i]);
            if (idx < 0)
            {
                p = root; // no pattern can contain this character: restart
                continue;
            }
            while (p->children[idx] == NULL && p != root)
            {
                p = p->fail; // this is where the failure link does its job
            }
            p = p->children[idx];
            if (p == NULL)
            {
                p = root; // nothing matches; restart from the root at the next character
            }
            // Report every pattern that ends at position i: the current node
            // and all nodes reachable through its failure chain.
            for (AcNode *tmp = p; tmp != root; tmp = tmp->fail)
            {
                if (tmp->isEndingChar)
                {
                    const int pos = i - tmp->length + 1;
                    std::cout << "匹配起始下标: " << pos << "; 长度: " << tmp->length << "; body: ";
                    std::cout.write(text + pos, tmp->length);
                    std::cout << std::endl;
                }
            }
        }
    }

private:
    // Copying is disabled (declared, never defined): the nodes are owned
    // through raw pointers and a shallow copy would free them twice.
    Trie4AhoCorasick(const Trie4AhoCorasick &);
    Trie4AhoCorasick &operator=(const Trie4AhoCorasick &);

    // Maps a character to its child slot, or -1 when it is outside 'a'..'z'.
    static int indexOf(char c)
    {
        return (c >= 'a' && c <= 'z') ? (c - 'a') : -1;
    }

    // Walks the trie along `pattern`; returns the node reached, or NULL when
    // the pattern is NULL, contains an unsupported character, or leaves the trie.
    AcNode *locate(const char *pattern) const
    {
        if (pattern == NULL)
        {
            return NULL;
        }
        AcNode *p = root;
        for (const char *c = pattern; *c != '\0'; ++c)
        {
            const int index = indexOf(*c);
            if (index < 0 || p->children[index] == NULL)
            {
                return NULL;
            }
            p = p->children[index];
        }
        return p;
    }

    // Frees `node` and its whole subtree (children first, then the node).
    void destroyTrieNode(AcNode *node)
    {
        if (node == NULL)
        {
            return;
        }
        for (int i = 0; i < kAlphabetSize; ++i)
        {
            destroyTrieNode(node->children[i]);
        }
        delete node;
    }

private:
    AcNode *root;        // root of the trie; never NULL
    bool failLinksStale; // true when nodes were added after the last buildFailurePointer()
};
// Demo entry point: loads a fixed set of patterns into the automaton, builds
// the failure pointers and prints every pattern occurrence in a sample text.
int main()
{
    // Patterns under test, inserted in the listed order.
    const char *patterns[] = {"ho", "how", "hi", "he", "her", "hello", "so", "see"};
    const int patternCount = sizeof(patterns) / sizeof(patterns[0]);

    Trie4AhoCorasick trie;
    for (int k = 0; k < patternCount; ++k)
    {
        trie.insert(patterns[k]);
    }

    trie.buildFailurePointer();
    trie.match("herseesohow");
    return 0;
}