开源AC算法链接: https://sourceforge.net/projects/multifast/files/
1. 一个开源AC算法源码分析: http://blog.csdn.net/WJ_1062/article/details/48751951
2. 从头到尾彻底理解KMP: http://blog.csdn.net/v_july_v/article/details/7041827
本文中采用的版本是multifast-v1.4.2。
/*
* example1.c: This program illustrates how to use ahocorasick library
* it shows how to use the search interface to find patterns
* This file is part of multifast.
*
Copyright 2010-2013 Kamiar Kanani <kamiar.kanani@gmail.com>
multifast is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
multifast is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with multifast. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <string.h>
#include "ahocorasick.h"
/* Patterns that main() loads into the automata. They are string literals,
 * so the text stays valid for the whole run of the program. */
AC_ALPHABET_t *sample_patterns[] = {
    "taobao",
    "youku",
    "weixin",
    "weibo",
    "iqiyi",
    "baidu"
};

/* Number of entries in sample_patterns. */
#define PATTERN_NUMBER (sizeof(sample_patterns) / sizeof(sample_patterns[0]))

/* Texts that main() searches, in this order. */
AC_ALPHABET_t *input_text1 = "find.baidu.com/123";
AC_ALPHABET_t *input_text2 = "buy.taobao.com";
AC_ALPHABET_t *input_text3 = "video.youku.cn";
// 1. Define a call-back function of AC_MATCH_CALBACK_t
/*
 * Match call-back passed to ac_automata_search().
 * Prints the reported position followed by the number and text of every
 * pattern contained in the match, one match per output line.
 *
 * matchp: the match being reported
 * param:  caller-supplied pointer; this example does not use it
 *
 * Always returns 0, which tells the search to keep going. Returning a
 * non-zero value instead stops the search; e.g. to collect only the first N
 * matches, count them here and return non-zero once the counter exceeds N.
 */
int match_handler (AC_MATCH_t * matchp, void * param)
{
    unsigned int idx = 0;

    (void) param;

    printf ("@ %2ld: ", matchp->position);

    /* CAUTION: astring is only safe to use when the pattern text has
     * permanent storage inside the program (as the string literals of this
     * example do); otherwise it points to an invalid memory location. */
    while (idx < matchp->match_num)
    {
        printf ("#%ld (%s), ", matchp->patterns[idx].rep.number, matchp->patterns[idx].astring);
        idx++;
    }

    printf ("\n");
    return 0;
}
/*
 * Demo driver: builds an automata from sample_patterns, finalizes it and
 * searches the three sample texts, printing every match via match_handler.
 *
 * Returns 0 on success and 1 when the automata could not be created.
 */
int main (int argc, char ** argv)
{
    unsigned int i;

    // 2. Define AC variables
    AC_AUTOMATA_t *atm;
    AC_PATTERN_t tmp_patt;
    AC_TEXT_t tmp_text;

    // command-line arguments are not used by this example
    (void) argc;
    (void) argv;

    // 3. Get a new automata
    atm = ac_automata_init ();
    if (!atm)
    {
        // nothing can be added or searched without an automata
        fprintf (stderr, "ac_automata_init failed\n");
        return 1;
    }

    // 4. Add patterns to automata
    for (i = 0; i < PATTERN_NUMBER; i++)
    {
        tmp_patt.astring = sample_patterns[i];
        tmp_patt.rep.number = i + 1; // optional
        tmp_patt.length = strlen (tmp_patt.astring);
        ac_automata_add (atm, &tmp_patt);
    }

    // 5. Finalize automata.
    // after you have finished adding patterns you must finalize the automata;
    // from then on no more patterns can be added.
    ac_automata_finalize (atm);

    // 5.1. Display automata (optional, left disabled):
    //   ac_automata_display (atm, 'n');
    // the second argument determines the cast type of the pattern
    // representative: 'n' as number, 's' as string. This example fills the
    // integer part of the union (tmp_patt.rep.number), hence 'n'.

    printf ("Searching: \"%s\"\n", input_text1);

    // 6. Set input text
    tmp_text.astring = input_text1;
    tmp_text.length = strlen (tmp_text.astring);

    // 7. Do search
    // the 5th argument is a (void *) that is forwarded to the call-back
    // function, so anything can be passed through it. A typical practice is
    // to define a struct that encloses the input and output variables the
    // call-back needs. This example passes nothing.
    ac_automata_search (atm, &tmp_text, 0, match_handler, NULL);

    printf ("Searching: \"%s\"\n", input_text2);

    // do another search
    tmp_text.astring = input_text2;
    tmp_text.length = strlen (tmp_text.astring);
    ac_automata_search (atm, &tmp_text, 0, match_handler, NULL);

    printf ("Searching: \"%s\" with \'keep\' enabled\n", input_text3);

    // and another
    // when the keep option (3rd argument) is set, the automata treats the
    // given text as the next chunk of the previous text. To see the
    // difference, try it with 0 and 1 and compare the results.
    tmp_text.astring = input_text3;
    tmp_text.length = strlen (tmp_text.astring);
    ac_automata_search (atm, &tmp_text, 1, match_handler, NULL);

    // 8. Release automata
    // do not forget to release the automata once you are done with it
    ac_automata_release (atm);

    return 0;
}
1. 步骤一---初始化atm(自动机)
/*
 * Allocate and initialise an empty automata that is ready to accept patterns.
 *
 * Returns the new automata, or NULL when memory could not be obtained; in
 * that case everything allocated here has already been freed again.
 */
AC_AUTOMATA_t * ac_automata_init ()
{
    /* calloc hands back zero-filled storage, so no separate memset is needed */
    AC_AUTOMATA_t * thiz = calloc (1, sizeof *thiz);
    if (!thiz)
        return NULL;

    /* Pointer table that records every node of the automata; it starts with
     * room for REALLOC_CHUNK_ALLNODES entries. */
    thiz->all_nodes_max = REALLOC_CHUNK_ALLNODES;
    thiz->all_nodes = malloc (thiz->all_nodes_max * sizeof *thiz->all_nodes);
    if (!thiz->all_nodes)
    {
        free (thiz);
        return NULL;
    }

    /* NOTE(review): node_create() is defined elsewhere; this guard assumes it
     * reports failure by returning NULL - confirm against its definition. */
    thiz->root = node_create ();
    if (!thiz->root)
    {
        free (thiz->all_nodes);
        free (thiz);
        return NULL;
    }

    /* The root is the first node recorded in all_nodes */
    ac_automata_register_nodeptr (thiz, thiz->root);

    /* Put the search state back to its starting point */
    ac_automata_reset (thiz);

    thiz->total_patterns = 0;
    thiz->automata_open = 1; /* open: patterns may be added */
    return thiz;
}
2. 步骤二---将对应的模式字符串加入到trie树中
/******************************************************************************
 * FUNCTION: ac_automata_add
 * Inserts one pattern into the automata. This is the step that builds the
 * trie: the pattern is followed character by character from the root, and a
 * new node is created for every character that has no matching edge yet.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
 * AC_PATTERN_t * patt: the pointer to the pattern to add
 * RETURN VALUE: AC_STATUS_t
 * ACERR_SUCCESS: the pattern was stored
 * ACERR_AUTOMATA_CLOSED: automata_open is cleared, no patterns are accepted
 * ACERR_ZERO_PATTERN: the pattern length is 0
 * ACERR_LONG_PATTERN: the pattern is longer than AC_PATTRN_MAX_LENGTH
 * ACERR_DUPLICATE_PATTERN: the node the pattern ends on is already final
 ******************************************************************************/
AC_STATUS_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt)
{
    AC_NODE_t * cursor = thiz->root;
    AC_NODE_t * child;
    AC_ALPHABET_t symbol;
    unsigned int pos;

    if (!thiz->automata_open)
        return ACERR_AUTOMATA_CLOSED;
    if (!patt->length)
        return ACERR_ZERO_PATTERN;
    if (patt->length > AC_PATTRN_MAX_LENGTH)
        return ACERR_LONG_PATTERN;

    for (pos = 0; pos < patt->length; pos++)
    {
        symbol = patt->astring[pos];

        /* Follow the edge labelled 'symbol' when the current node has one */
        child = node_find_next (cursor, symbol);
        if (!child)
        {
            /* No such edge: grow the trie by one node, one level deeper than
             * its parent, and record it in the automata's all_nodes table */
            child = node_create_next (cursor, symbol);
            child->depth = cursor->depth + 1;
            ac_automata_register_nodeptr (thiz, child);
        }
        cursor = child;
    }

    /* 'cursor' is now the node of the pattern's last character. If it is
     * already final, the same pattern was added before. */
    if (cursor->final)
        return ACERR_DUPLICATE_PATTERN;

    cursor->final = 1;
    node_register_matchstr (cursor, patt); /* attach the pattern to its end node */
    thiz->total_patterns++;

    return ACERR_SUCCESS;
}
最终形成的树图如下:
此时,所有节点的failure_node都为NULL。
3.步骤三---为树上的每一个节点设置失败跳转节点
/******************************************************************************
 * FUNCTION: ac_automata_set_failure
 * Finds the failure node of the given node and stores it in
 * node->failure_node.
 *
 * alphas[0 .. node->depth-1] is read as the sequence of characters leading
 * from the root to 'node' (ac_automata_traverse_setfailure fills it in that
 * way). Starting with the longest, each proper suffix alphas[start..depth-1]
 * is walked from the root; the first suffix that exists completely in the
 * trie ends on the failure node.
 * Example: with alphas[1]='t', alphas[2]='a', alphas[3]='o' and depth 4 the
 * strings "tao", "ao" and "o" are tried in that order.
 * When no suffix matches, the root becomes the failure node.
 ******************************************************************************/
static void ac_automata_set_failure
    (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
{
    unsigned int start;

    for (start = 1; start < node->depth; start++)
    {
        AC_NODE_t * probe = thiz->root;
        unsigned int k = start;

        /* Walk the suffix from the root; probe turns NULL as soon as an
         * edge is missing */
        while (k < node->depth && probe)
        {
            probe = node_find_next (probe, alphas[k]);
            k++;
        }

        if (!probe)
            continue; /* this suffix is not in the trie, try a shorter one */

        node->failure_node = probe;
        break;
    }

    /* No suffix matched: fall back to the root */
    if (!node->failure_node)
        node->failure_node = thiz->root;
}
/******************************************************************************
 * FUNCTION: ac_automata_traverse_setfailure
 * Visits every node below 'node' depth-first and sets the failure node of
 * each one on the way down.
 *
 * 'alphas' is a scratch buffer: before descending along an edge, the edge's
 * character is written at index node->depth, so that alphas always spells the
 * path from the root to the child being processed.
 ******************************************************************************/
static void ac_automata_traverse_setfailure
    (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
{
    unsigned int edge = 0;

    while (edge < node->outgoing_degree)
    {
        AC_NODE_t * child = node->outgoing[edge].next;

        /* Extend the recorded path by the character of this edge */
        alphas[node->depth] = node->outgoing[edge].alpha;

        /* Resolve the child's failure node first ... */
        ac_automata_set_failure (thiz, child, alphas);

        /* ... then recurse into its own children */
        ac_automata_traverse_setfailure (thiz, child, alphas);

        edge++;
    }
}
/******************************************************************************
 * FUNCTION: ac_automata_finalize
 * Completes the automata once all patterns have been added:
 *   1. sets the failure node of every node (depth-first from the root),
 *   2. for every node in all_nodes, calls ac_automata_union_matchstrs() to
 *      collect its matched patterns and node_sort_edges() to sort its
 *      outgoing edges, so that binary search can be performed on them,
 *   3. clears automata_open, after which ac_automata_add() refuses new
 *      patterns.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
 ******************************************************************************/
void ac_automata_finalize (AC_AUTOMATA_t * thiz)
{
    /* Scratch buffer for the root-to-node path during the traversal */
    AC_ALPHABET_t path[AC_PATTRN_MAX_LENGTH];
    unsigned int idx = 0;

    ac_automata_traverse_setfailure (thiz, thiz->root, path);

    while (idx < thiz->all_nodes_num)
    {
        AC_NODE_t * current = thiz->all_nodes[idx];

        ac_automata_union_matchstrs (current);
        node_sort_edges (current);
        idx++;
    }

    thiz->automata_open = 0; /* do not accept patterns any more */
}
4. 步骤四---查找
/******************************************************************************
 * FUNCTION: node_findbs_next
 * Looks up the outgoing edge of 'thiz' that is labelled 'alpha' and returns
 * the node it leads to, or NULL when there is no such edge.
 * The lookup is a binary search, so it is only valid after the
 * pre-processing stage in which the edges are sorted.
 ******************************************************************************/
AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha)
{
    int lo = 0;
    int hi = thiz->outgoing_degree - 1;

    while (lo <= hi)
    {
        int probe = (lo + hi) / 2;
        AC_ALPHABET_t edge_alpha = thiz->outgoing[probe].alpha;

        if (alpha == edge_alpha)
            return thiz->outgoing[probe].next;

        if (alpha > edge_alpha)
            lo = probe + 1; /* target lies in the upper half */
        else
            hi = probe - 1; /* target lies in the lower half */
    }

    return NULL;
}
/******************************************************************************
 * FUNCTION: ac_automata_search
 * Searches the input text with the given automata. On every match the
 * call-back function is invoked; its integer result decides how to go on:
 * 0 means continue searching, non-0 means stop and return to the caller.
 * PARAMS:
 * AC_AUTOMATA_t * thiz: the pointer to the automata
 * AC_TEXT_t * text: the input text that must be searched
 * int keep: non-0 if the text is the successive chunk of the previously
 *   given text; 0 resets the automata before searching
 * AC_MATCH_CALBACK_f callback: function called for every match
 * void * param: forwarded unchanged to the call-back function; useful for
 *   passing data between the caller and the call-back
 * RETURN VALUE:
 * -1: failed; the automata is still open (ac_automata_finalize() clears
 *     automata_open)
 *  0: success; input text was searched to the end
 *  1: success; input text was searched partially (call-back broke the loop).
 *     In this case current_node and base_position are not updated.
 ******************************************************************************/
int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * text, int keep,
    AC_MATCH_CALBACK_f callback, void * param)
{
    unsigned long position = 0;
    AC_NODE_t * current;
    AC_NODE_t * next;
    AC_MATCH_t match;

    if (thiz->automata_open)
        return -1;

    thiz->text = 0;

    if (!keep)
        ac_automata_reset (thiz); /* start over instead of continuing */

    current = thiz->current_node;

    /* This is the main search loop.
     * it must be as lightweight as possible. */
    while (position < text->length)
    {
        /* Binary-search the outgoing edges of the current node */
        next = node_findbs_next (current, text->astring[position]);

        if (!next)
        {
            /* No edge for this character. With a failure node available,
             * retry the same character from there; without one the
             * character is skipped. */
            if (current->failure_node)
                current = current->failure_node;
            else
                position++;

            /* Arriving at a node through a fail is never reported: the
             * match was already reported at the previous node. */
            continue;
        }

        /* Alphabet transition: consume the character */
        current = next;
        position++;

        if (current->final)
        {
            /* We found a match: describe it and do the call-back */
            match.position = position + thiz->base_position;
            match.match_num = current->matched_patterns_num;
            match.patterns = current->matched_patterns;

            if (callback (&match, param))
                return 1;
        }
    }

    /* Save status variables so that a later call with keep set can resume */
    thiz->current_node = current;
    thiz->base_position += position;

    return 0;
}
1. 一个开源AC算法源码分析: http://blog.csdn.net/WJ_1062/article/details/48751951
2. 从头到尾彻底理解KMP: http://blog.csdn.net/v_july_v/article/details/7041827