如果只在一个字符串中查找单个子串,实现方法比较简单,比如使用C库函数strstr()。如果要在字符串中同时查找多个子串,就需要使用多模式匹配方法。AC(Aho-Corasick)多模式匹配算法用于解决多字符串匹配问题,介绍原理的文章很多,不过一般的示例程序只支持匹配26个小写字母。实际项目中,目标串往往带有各种符号及中文,因此构造的trie树需要覆盖0x20—0xFF范围内的所有字节(可见ASCII字符,以及中文等多字节编码使用的高位字节)。
下面是C++实现代码:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <queue>
#include <set>
#include <string>
#include <vector>
using namespace std;
// Number of byte values covered by a node's transition table: 0x20 through 0xFF inclusive.
// A typed constant is used instead of a macro so the name obeys scope and type rules.
constexpr int VISIBLE_NUMBER = 0xFF - 0x20 + 1;
struct StateNode
{
bool finish_{ false };
int state_{ 0 };
int num_{0};
string pattern_{};
//goto table
vector<StateNode *> transition_table_{ vector<StateNode *>(VISIBLE_NUMBER) };
};
class ACSM
{
private:
StateNode *start_node_;
int state_count_;
vector<StateNode *> corresponding_node_;
vector<StateNode *> fail_;
public:
ACSM() :start_node_{ new StateNode() }, state_count_{ 0 }
{
//state0 is start_node_
corresponding_node_.push_back(start_node_);
}
//read all patterns and produce the goto table
void load_pattern(const vector<string> &_Patterns)
{
int latest_state = 1;
int count = 0;
for (const auto &pattern : _Patterns)
{
auto *p = start_node_;
for (int i = 0; i < pattern.size(); ++i)
{
auto *next_node = p->transition_table_[pattern[i] - 0x20];
if (next_node == nullptr)
{
next_node = new StateNode();
}
if (next_node->state_ == 0)
{
next_node->state_ = latest_state++;
//update the table
corresponding_node_.push_back(next_node);
}
//the goto table
p->transition_table_[pattern[i] - 0x20] = next_node;
p = next_node;
}
p->finish_ = true;
p->pattern_ = pattern;
p->num_ = count++;
}
for (int i = 0; i < VISIBLE_NUMBER; ++i)
{
if (start_node_->transition_table_[i] == nullptr)
{
start_node_->transition_table_[i] = start_node_;
}
}
state_count_ = latest_state;
}
//produce fail function
void dispose()
{
queue<StateNode *> q;
fail_ = std::move(vector<StateNode *>(state_count_));
for (const auto nxt : start_node_->transition_table_)
{
//d=1,f=0
if (nxt->state_ != 0)
{
fail_[nxt->state_] = start_node_;
q.push(nxt);
}
}
//calculate all fail redirection
while (!q.empty())
{
auto known = q.front();
q.pop();
for (int i = 0; i < VISIBLE_NUMBER; ++i)
{
auto nxt = known->transition_table_[i];
if (nxt && nxt->state_ != 0)
{
auto p = fail_[known->state_];
while (!p->transition_table_[i])
{
p = fail_[p->state_];
}
fail_[nxt->state_] = p->transition_table_[i];
q.push(nxt);
}
}
}
}
//search matching
void match(const string &_Str, set<int> &matched)
{
int trans = 0;
auto p = start_node_;
for (int i = 0; i < _Str.size(); ++i)
{
trans = (_Str[i]&0xff) - 0x20;
p = p->transition_table_[trans] ? p->transition_table_[trans] : (--i, fail_[p->state_]);
if (p->finish_)
{
printf("pattern=%s, num=%d\n", p->pattern_.c_str(), p->num_);
matched.insert(p->num_);
}
}
}
};
int main()
{
ACSM acsm;
vector<string> patterns{"has join", "has disconnected", "Licenses=\"1\""};
set<int> matched;
acsm.load_pattern(patterns);
acsm.dispose();
string str="<134>Jan 19 02:25:14 Message=\"Participant has joined.\" Name=\"土木工程\"Licenses=\"1\"";
acsm.match(str, matched);
for (auto m: matched)
cout << m<< endl;
//system("pause");
return 0;
}