要学会AC自动机,我们必须知道什么是Trie,也就是字典树。最好对KMP算法也有些了解。Trie树和KMP算法我之前博客都有写过,感兴趣的可以看看。
先简单叙述一下问题。现在给出以下关键词:
"hsay";
"ah";
"sahe";
"he";
"say";
"herhb";
"aher";
"erhs"
共8个关键词,问:这8个关键词中有几个在字符串"yasaherhsay"中出现过?
答案是7(只有"herhb"没有出现)。
这就是一个多模式匹配问题。
AC自动机算法分为3步:构造一棵Trie树、构造失败指针、进行模式匹配。
失败指针和KMP算法中的next函数或称shift函数的功能类似。
上图解释了失败指针的作用。
// AC_automachine.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>
using namespace std;
#define MAXSIZE 26  // number of child slots per node: one per letter 'a'..'z'

// One node of the keyword trie used by the matcher.
struct TrieNode
{
    TrieNode* next[MAXSIZE];      // child per letter (index = letter - 'a'); NULL when absent
    TrieNode* parent;             // node one character closer to the root; NULL for the root
    std::vector<TrieNode*> fail;  // nodes whose path from the root is a proper suffix of this node's path
    char p;                       // character on the edge leading from parent to this node
    int Num;                      // how many distinct inserted keywords pass through this node
    bool isword;                  // true when an inserted keyword ends exactly here

    // Put every freshly allocated node into a well-defined empty state, so
    // `new TrieNode` never exposes indeterminate pointers, counters or flags
    // even if a caller forgets to fill in a field.
    TrieNode() : parent(NULL), p('\0'), Num(0), isword(false)
    {
        for (int i = 0; i < MAXSIZE; i++)
            next[i] = NULL;
    }
};
set<string>re;// result set: every distinct keyword the matcher has found so far
// Allocates the root of an empty trie: no children, no parent, a keyword
// count of zero, and not marking the end of any keyword.
// The node is created with `new` and is not freed here; releasing it is up
// to the caller.
TrieNode*initiate_Trie()
{
    TrieNode*root = new TrieNode;
    for (int i = 0; i < MAXSIZE; i++)
        root->next[i] = NULL;
    root->parent = NULL;
    // The root sits on no edge; give `p` a defined value rather than
    // leaving it indeterminate.
    root->p = '\0';
    root->Num = 0;
    root->isword = false;
    return root;
}
bool search(TrieNode*root, char*str)
{
TrieNode*tn;
tn = root;
int k;
while (*str != '\0')
{
k = *str - 'a';
if (tn->next[k] == NULL)
return false;
tn = tn->next[k];
str++;
}
if (tn->isword == false)
return false;
return true;
}
// Inserts the keyword `str` into the trie rooted at `root` and returns `root`.
//
// If the keyword is already present the trie is left untouched. Otherwise
// `Num` is incremented on the root and on every node along the keyword's
// path (new nodes start at 1), and the final node is flagged with `isword`.
//
// root: trie root; NULL is returned unchanged.
// str:  NUL-terminated keyword made of 'a'..'z'. It is only read, hence the
//       const-qualified parameter. NULL, or a keyword containing any other
//       character, is rejected and the trie is left unchanged, because such
//       a character would index outside `next[]`.
TrieNode*build_Trie_singleword(TrieNode*root, const char*str)
{
    if (root == NULL || str == NULL)
        return root;

    // Validate the whole keyword up front so a bad character can never
    // leave a half-inserted path behind.
    for (const char*c = str; *c != '\0'; ++c)
    {
        const int k = *c - 'a';
        if (k < 0 || k >= MAXSIZE)
            return root;
    }

    // search() only reads the string; the cast keeps this call valid whether
    // search() is declared with `char*` or `const char*`.
    if (search(root, const_cast<char*>(str)))
        return root;

    root->Num = root->Num + 1;
    TrieNode*tn = root;
    for (; *str != '\0'; ++str)
    {
        const int k = *str - 'a';
        TrieNode*child = tn->next[k];
        if (child == NULL)
        {
            child = new TrieNode;
            for (int i = 0; i < MAXSIZE; i++)
                child->next[i] = NULL;
            child->p = *str;
            child->Num = 1;
            child->parent = tn;
            child->isword = false;
            tn->next[k] = child;
        }
        else
        {
            child->Num = child->Num + 1;
        }
        tn = child;
    }
    tn->isword = true;
    return root;
}
// Fills the `fail` list of `node` and, recursively, of every node below it.
// Call it as initiate_fail_pointer(root, root) to process the whole trie.
//
// For a non-root node the list receives every trie node whose path from the
// root spells a proper suffix of this node's own path, visited from the
// shortest suffix to the longest; a node already present is not added twice.
// The root itself gets no entries. Each visited non-root node also writes
// its character to std::cout.
void initiate_fail_pointer(TrieNode*root, TrieNode*node)
{
    if (node != root)
    {
        std::cout << node->p;

        // Suffix of length one: the root's child for this node's own letter.
        TrieNode*single = root->next[node->p - 'a'];
        if (single != NULL && single != node &&
            std::find(node->fail.begin(), node->fail.end(), single) == node->fail.end())
        {
            node->fail.push_back(single);
        }

        // `tail` holds the current suffix in reverse order (last letter first).
        // Each step towards the root prepends one more letter to the suffix.
        std::vector<char> tail(1, node->p);
        for (TrieNode*cur = node; cur->parent != root; cur = cur->parent)
        {
            tail.push_back(cur->parent->p);

            // Try to spell the suffix starting from the root.
            TrieNode*hit = root;
            int idx = static_cast<int>(tail.size()) - 1;
            while (idx >= 0 && hit->next[tail[idx] - 'a'] != NULL)
            {
                hit = hit->next[tail[idx] - 'a'];
                --idx;
            }

            // idx < 0 means the whole suffix exists as a path; the full path
            // leads back to `node` itself, which is not a proper suffix.
            if (idx < 0 && hit != node &&
                std::find(node->fail.begin(), node->fail.end(), hit) == node->fail.end())
            {
                node->fail.push_back(hit);
            }
        }
    }

    // Descend into every child, in letter order.
    for (int c = 0; c < MAXSIZE; ++c)
    {
        if (node->next[c] != NULL)
            initiate_fail_pointer(root, node->next[c]);
    }
}
int AC_automachine(TrieNode*root, char*str)
{
int count = 0;
int len = strlen(str);
int k = 0;
while (k < len)
{
while (root->next[str[k] - 'a'] == NULL)
{
k++;
}
TrieNode*p,*node = root->next[str[k] - 'a'];
p = NULL;
while (node != NULL)
{
if (node->isword == true)
{
string aa;
TrieNode*nn = node;
while (nn != root)
{
aa += nn->p;
nn = nn->parent;
}
std::reverse(aa.begin(), aa.end());
if (re.find(aa) == re.end())
{
re.insert(aa);
count++;
}
}
if (!node->fail.empty())
{
for (int i = 0; i < node->fail.size(); i++)
if (node->fail[i]->isword)
{
string aa;
TrieNode*nn = node->fail[i];
while (nn != root)
{
aa += nn->p;
nn = nn->parent;
}
std::reverse(aa.begin(), aa.end());
if (re.find(aa) == re.end())
{
re.insert(aa);
count++;
}
}
}
k++;
p = node;
node = node->next[str[k] - 'a'];
}
k--;
node = p;
_ASSERT(node);
if (node->fail.empty())
{
k++;
}
else
{
int max = 0;
TrieNode*tn, *tp;
tn = NULL;
int kk;
for (int i = 0; i < node->fail.size(); i++)
{
kk = 0;
tp = node->fail[i];
while (tp != NULL)
{
if (tp->isword)
{
string aa;
TrieNode*nn = tp;
while (nn != root)
{
aa += nn->p;
nn = nn->parent;
}
std::reverse(aa.begin(), aa.end());
if (re.find(aa) == re.end())
{
re.insert(aa);
count++;
}
}
if (!tp->fail.empty())
{
for (int i = 0; i < tp->fail.size(); i++)
if (tp->fail[i]->isword)
{
string aa;
TrieNode*nn = tp->fail[i];
while (nn != root)
{
aa += nn->p;
nn = nn->parent;
}
std::reverse(aa.begin(), aa.end());
if (re.find(aa) == re.end())
{
re.insert(aa);
count++;
}
}
}
kk++;
p = tp;
tp = tp->next[str[k + kk] - 'a'];
}
if (kk > max)
{
max = kk;
tn = p;
_ASSERT(tn);
}
}
if (!tn->fail.empty())
{
int maxlen=0;
for (int i = 0; i < tn->fail.size(); i++)
{
TrieNode*mm = tn->fail[i];
int kkk = 0;
while (mm != root)
{
mm = mm->parent;
kkk++;
}
if (kkk > maxlen)
maxlen = kkk;
}
k = k + kk - maxlen;
}
else
{
k = k + kk;
}
}//end of else
}
return count;
}
int _tmain(int argc, _TCHAR* argv[])
{
TrieNode*root = initiate_Trie();
root = build_Trie_singleword(root, "hsay");
root = build_Trie_singleword(root, "ah");
root = build_Trie_singleword(root, "sahe");
root = build_Trie_singleword(root, "he");
root = build_Trie_singleword(root, "say");
root = build_Trie_singleword(root, "herhb");
root = build_Trie_singleword(root, "aher");
root = build_Trie_singleword(root, "erhs");
initiate_fail_pointer(root, root);
cout << endl;
cout << AC_automachine(root, "yasaherhsay") << endl;
system("pause");
return 0;
}