-
源代码下载
gnu软件源代码地址 http://mirrors.ustc.edu.cn/gnu/ -
解压后编译
./configure
make -
阅读源代码,学习 grep 的 AC(Aho-Corasick)算法实现方式
如下是 fgrep 用到的数据结构。
/* Node of the balanced binary tree that stores the outgoing edges of
   one trie node.  kwsincr searches this tree by comparing `label'.
   NOTE: `llink' must remain the first member; kwsincr casts the address
   of a trie's `links' pointer to `struct tree *' and stores through
   `llink' to update it.  */
struct tree
{
  struct tree *llink;           /* Left branch. */
  struct tree *rlink;           /* Right branch. */
  struct trie *trie;            /* Trie node reached by following this edge. */
  unsigned char label;          /* Character (byte) this edge stands for. */
  char balance;                 /* Balance factor: kwsincr decrements it when
                                   the left side grows and increments it when
                                   the right side grows. */
};
/* Node of the keyword trie. */
struct trie
{
  /* Zero when no keyword ends at this node (the value given to every
     freshly created node).  When a keyword ends here, kwsincr stores
     2 * W + 1, where W is the number of keywords added before it.
     The original notes also say the value is SIZE_MAX when the path
     from the root to this node contains another keyword as a suffix;
     that assignment is not part of this excerpt -- TODO confirm.  */
  size_t accepting;
  struct tree *links;           /* Root of the balanced binary tree holding
                                   this node's outgoing edges. */
  struct trie *parent;          /* Parent trie node. */
  struct trie *next;            /* Next trie node in the level-order
                                   (breadth-first) chain that is walked when
                                   the failure pointers are computed. */
  struct trie *fail;            /* Aho-Corasick failure pointer. */
  ptrdiff_t depth;              /* Depth of this node: its parent's depth
                                   plus one. */
  ptrdiff_t shift;              /* Shift function for search failures. */
  ptrdiff_t maxshift;           /* Max shift of self and descendants. */
};
这里采用了平衡二叉树的数据结构来实现字典树:每个trie节点只为实际出现的字符保存出边,而不必为所有可能的字符预留子节点指针,因此和普通的字典树相比更节省空间。
tree可以理解为二叉树的node。
trie可以理解为字典树的node。
eg:
现在有pattern如下:
aac abc
aad abd
aae abe
组成如下的树形结构
圆圈是trie;小方块是tree;蓝色的短线是tree的trie指针;黄色的箭头是llink和rlink。图画得比较粗糙,别在意。
下面的代码就是构建树的过程,其中包含插入新节点后通过旋转使二叉树重新平衡的过程。
/* Add the keyword TEXT, of length LEN bytes, to the trie of KWSET.

   Each byte is optionally mapped through KWSET->trans and then looked
   up among the outgoing edges of the current trie node.  Those edges
   are kept in a balanced binary tree; when the byte has no edge yet, a
   new tree node and a new trie node are allocated from KWSET's obstack
   and the tree is rebalanced.  After the last byte, the node reached is
   marked as accepting and the keyword count and the shortest/longest
   keyword depths are updated.

   When KWSET->kwsexec is bmexec the keyword is inserted back to front,
   i.e. from its last byte to its first.  */
void kwsincr (kwset_t kwset, char const *text, ptrdiff_t len)
{
  assume (0 <= len);
  struct trie *trie = kwset->trie;
  char const *trans = kwset->trans;
  bool reverse = kwset->kwsexec == bmexec;

  /* In reverse mode the bytes are fetched with *--text, so TEXT has to
     start one past the end of the keyword.  Without this adjustment the
     loop would read the bytes located before the keyword.  */
  if (reverse)
    text += len;

  /* Descend the trie one keyword byte at a time, creating nodes as
     needed.  */
  while (len--)
    {
      unsigned char uc = reverse ? *--text : *text++;
      unsigned char label = trans ? trans[uc] : uc;

      /* Search the current trie node's edge tree for LABEL, recording
         the path taken: links[i] is the i-th node on the path and
         dirs[i] the direction followed out of it.  Slot 0 is a fake
         node overlaid on trie->links; because llink is the first member
         of struct tree and dirs[0] is L, storing through
         links[0]->llink updates trie->links itself.  */
      struct tree *cur = trie->links;
      struct tree *links[DEPTH_SIZE];
      enum { L, R } dirs[DEPTH_SIZE];
      links[0] = (struct tree *) &trie->links;
      dirs[0] = L;
      ptrdiff_t depth = 1;

      while (cur && label != cur->label)
        {
          links[depth] = cur;
          if (label < cur->label)
            dirs[depth++] = L, cur = cur->llink;
          else
            dirs[depth++] = R, cur = cur->rlink;
        }

      /* The current character doesn't have an outgoing link at
         this trie node, so build a new trie node and install
         a link in the current trie node's tree.  */
      if (!cur)
        {
          cur = obstack_alloc (&kwset->obstack, sizeof *cur);
          cur->llink = NULL;
          cur->rlink = NULL;
          cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
          cur->trie->accepting = 0;
          cur->trie->links = NULL;
          cur->trie->parent = trie;
          cur->trie->next = NULL;
          cur->trie->fail = NULL;
          cur->trie->depth = trie->depth + 1;
          cur->trie->shift = 0;
          cur->label = label;
          cur->balance = 0;

          /* Install the new tree node in its parent.  */
          if (dirs[--depth] == L)
            links[depth]->llink = cur;
          else
            links[depth]->rlink = cur;

          /* Back up the tree fixing the balance flags: every ancestor
             that was perfectly balanced now leans toward the side the
             new node was inserted on.  */
          while (depth && !links[depth]->balance)
            {
              if (dirs[depth] == L)
                --links[depth]->balance;
              else
                ++links[depth]->balance;
              --depth;
            }

          /* Adjust the first ancestor that was already leaning.  If the
             adjustment brings it back to zero nothing more is needed;
             otherwise it has reached -2 or 2 and the subtree rooted
             there is rebalanced with one of four rotations
             (LL, LR, RR, RL).  */
          if (depth && ((dirs[depth] == L && --links[depth]->balance)
                        || (dirs[depth] == R && ++links[depth]->balance)))
            {
              struct tree *t, *r, *l, *rl, *lr;
              switch (links[depth]->balance)
                {
                case (char) -2:
                  /* Left side too deep.  */
                  switch (dirs[depth + 1])
                    {
                    case L:
                      /* Left-left: single rotation, the left child T
                         becomes the subtree root.  */
                      r = links[depth], t = r->llink, rl = t->rlink;
                      t->rlink = r, r->llink = rl;
                      t->balance = r->balance = 0;
                      break;
                    case R:
                      /* Left-right: double rotation, the left child's
                         right child T becomes the subtree root.  */
                      r = links[depth], l = r->llink, t = l->rlink;
                      rl = t->rlink, lr = t->llink;
                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
                      l->balance = t->balance != 1 ? 0 : -1;
                      r->balance = t->balance != (char) -1 ? 0 : 1;
                      t->balance = 0;
                      break;
                    default:
                      abort ();
                    }
                  break;
                case 2:
                  /* Right side too deep.  */
                  switch (dirs[depth + 1])
                    {
                    case R:
                      /* Right-right: single rotation, the right child T
                         becomes the subtree root.  */
                      l = links[depth], t = l->rlink, lr = t->llink;
                      t->llink = l, l->rlink = lr;
                      t->balance = l->balance = 0;
                      break;
                    case L:
                      /* Right-left: double rotation, the right child's
                         left child T becomes the subtree root.  */
                      l = links[depth], r = l->rlink, t = r->llink;
                      lr = t->llink, rl = t->rlink;
                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
                      l->balance = t->balance != 1 ? 0 : -1;
                      r->balance = t->balance != (char) -1 ? 0 : 1;
                      t->balance = 0;
                      break;
                    default:
                      abort ();
                    }
                  break;
                default:
                  abort ();
                }

              /* Hook the new subtree root T into the node above it
                 (possibly the fake slot-0 node, i.e. trie->links).  */
              if (dirs[depth - 1] == L)
                links[depth - 1]->llink = t;
              else
                links[depth - 1]->rlink = t;
            }
        }

      trie = cur->trie;
    }

  /* Mark the node finally reached as accepting, encoding the
     index number of this word in the keyword set so far.  A node that
     is already accepting keeps its earlier value.  */
  if (!trie->accepting)
    {
      size_t words = kwset->words;
      trie->accepting = 2 * words + 1;
    }
  ++kwset->words;

  /* Keep track of the shortest and longest keyword lengths.  */
  if (trie->depth < kwset->mind)
    kwset->mind = trie->depth;
  if (trie->depth > kwset->maxd)
    kwset->maxd = trie->depth;
}
将所有的字符串都加入到树中后,下一步就是建立失败指针。
/* Traverse the trie nodes breadth-first by following the `next' chain,
   starting at the root.  LAST is handed to enqueue on every iteration;
   presumably it tracks the tail of the chain so that newly enqueued
   nodes are visited later -- confirm against enqueue.  */
for (curr = last = kwset->trie; curr; curr = curr->next)
  {
    /* Enqueue the immediate descendants in the level order queue.  */
    enqueue (curr->links, &last);
    /* Update the delta table for the descendants of this node.  */
    treedelta (curr->links, curr->depth, delta);
    /* Compute the failure pointers for the descendants of this node.  */
    treefails (curr->links, curr->fail, kwset->trie, reverse);
  }
到这一步,前期的准备工作就做好了。