linux工具常用fgrep的源码阅读

最新推荐文章于 2022-06-22 18:54:00 发布

manxi6801

最新推荐文章于 2022-06-22 18:54:00 发布

阅读量197

点赞数 1

分类专栏：技术文章标签： linux gnu grep ac 多模匹配

本文链接：https://blog.csdn.net/manxi6801/article/details/99450928

版权

技术专栏收录该内容

2 篇文章 0 订阅

订阅专栏

源代码下载
gnu软件源代码地址 http://mirrors.ustc.edu.cn/gnu/
解压后编译

./configure
make
阅读源代码，学习 grep 的 AC（Aho-Corasick）多模匹配算法的实现方式

如下是 fgrep 用到的数据结构。

/* Node of a balanced binary tree.  Each trie node keeps its outgoing
   edges in one of these trees, keyed by LABEL.
   NOTE: llink must remain the first member: kwsincr casts the address
   of a trie's `links' pointer to `struct tree *' and stores through
   llink to update the tree root.  */
struct tree
{
  struct tree *llink;		/* Left branch (labels smaller than this one).  */
  struct tree *rlink;		/* Right branch (labels larger than this one).  */
  struct trie *trie;		/* Trie node reached by following this edge.  */
  unsigned char label;		/* Character this edge represents.  */
  char balance;			/* Balance factor; kwsincr rebalances when it reaches -2 or 2.  */
};
/* Node of the keyword trie.  */
struct trie
{
  size_t accepting;		/* Marks whether a keyword ends at this node.  If so,
                                   accepting = 2 * W + 1 (kwsincr stores
                                   2 * kwset->words + 1).  */
                        /* Per the original note: if the path from this node to the
                           root contains another substring ending with the same label,
                           accepting = SIZE_MAX (the code setting this is not shown
                           here -- TODO confirm).  */
                        /* If no keyword ends here, accepting = 0.  */

  struct tree *links;		/* Root of the balanced binary tree of outgoing edges.  */
  struct trie *parent;		/* Parent trie node.  */
  struct trie *next;		/* Next trie node in the list walked by the failure-pointer
                                   pass (`curr = curr->next'); presumably level order --
                                   TODO confirm against enqueue.  */
  struct trie *fail;		/* Aho-Corasick failure pointer.  */
  ptrdiff_t depth;		/* Depth of this node; set to parent's depth + 1 on creation.  */
  ptrdiff_t shift;		/* Shift function for search failures.  */
  ptrdiff_t maxshift;		/* Max shift of self and descendants.  */
};

这里用平衡二叉树来保存每个 trie 节点的出边，从而实现字典树；和每个节点都为所有可能字符预留指针的普通字典树相比，更节省空间。

二叉树节点
字典树节点
tree可以理解为二叉树的node。
trie可以理解为字典树的node。
eg：
现在有 pattern 如下：
aac abc
aad abd
aae abe
组成如下的树形结构
在这里插入图片描述
圆圈是trie；小方块是tree；蓝色的短线是tree的trie指针；黄色的箭头是llink和rlink。图画的很low，别在意。
下面的代码就是构建树的过程，包含一个平衡二叉树的过程。

/* Add the keyword TEXT, of length LEN bytes, to the trie of KWSET.

   The keyword is inserted one byte at a time.  Each byte is first
   mapped through KWSET->trans when that table is non-null.  When the
   set's matcher is bmexec the keyword is inserted back to front.
   Missing trie nodes and edge-tree nodes are allocated from
   KWSET->obstack, and the per-node edge tree is rebalanced after each
   insertion.  On return the final node is marked accepting (unless it
   already was), KWSET->words is incremented, and KWSET->mind and
   KWSET->maxd are updated with the depth of the final node.  */
void kwsincr (kwset_t kwset, char const *text, ptrdiff_t len)
{
  assume (0 <= len);
  struct trie *trie = kwset->trie;
  char const *trans = kwset->trans;
  bool reverse = kwset->kwsexec == bmexec;

  /* In reverse mode bytes are fetched with `*--text', so TEXT must
     start one past the last byte of the keyword; otherwise the loop
     would read LEN bytes before the start of the buffer.  */
  if (reverse)
    text += len;

  /* Descend the trie character by character, installing new nodes
     when necessary.  */
  while (len--)
    {
      unsigned char uc = reverse ? *--text : *text++;
      unsigned char label = trans ? trans[uc] : uc;

      /* Descend the tree of outgoing links for this trie node,
         looking for the current character and keeping track
         of the path followed.  links[0] is a fake parent whose
         llink aliases trie->links (llink is the first member of
         struct tree), so replacing the root needs no special case.  */
      struct tree *cur = trie->links;
      struct tree *links[DEPTH_SIZE];
      enum { L, R } dirs[DEPTH_SIZE];
      links[0] = (struct tree *) &trie->links;
      dirs[0] = L;
      ptrdiff_t depth = 1;

      /* Binary search for LABEL in this node's edge tree.  */
      while (cur && label != cur->label)
        {
          links[depth] = cur;
          if (label < cur->label)
            dirs[depth++] = L, cur = cur->llink;
          else
            dirs[depth++] = R, cur = cur->rlink;
        }

      /* The current character doesn't have an outgoing link at
         this trie node, so build a new trie node and install
         a link in the current trie node's tree.  */
      if (!cur)
        {
          cur = obstack_alloc (&kwset->obstack, sizeof *cur);
          cur->llink = NULL;
          cur->rlink = NULL;
          cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
          cur->trie->accepting = 0;
          cur->trie->links = NULL;
          cur->trie->parent = trie;
          cur->trie->next = NULL;
          cur->trie->fail = NULL;
          cur->trie->depth = trie->depth + 1;
          cur->trie->shift = 0;
          cur->label = label;
          cur->balance = 0;

          /* Install the new tree node in its parent.  */
          if (dirs[--depth] == L)
            links[depth]->llink = cur;
          else
            links[depth]->rlink = cur;

          /* Back up the tree fixing the balance flags: every ancestor
             that was perfectly balanced now leans toward the side the
             new node was inserted on.  */
          while (depth && !links[depth]->balance)
            {
              if (dirs[depth] == L)
                --links[depth]->balance;
              else
                ++links[depth]->balance;
              --depth;
            }

          /* Rebalance the tree by pointer rotations if necessary.
             The first ancestor that was already leaning is adjusted;
             if its balance is still nonzero it has reached -2 or 2 and
             one of the four rotations (LL, LR, RR, RL) is applied.  */
          if (depth && ((dirs[depth] == L && --links[depth]->balance)
                        || (dirs[depth] == R && ++links[depth]->balance)))
            {
              struct tree *t, *r, *l, *rl, *lr;

              switch (links[depth]->balance)
                {
                case (char) -2:
                  switch (dirs[depth + 1])
                    {
                    case L:
                      /* LL: single rotation, left child T becomes the root.  */
                      r = links[depth], t = r->llink, rl = t->rlink;
                      t->rlink = r, r->llink = rl;
                      t->balance = r->balance = 0;
                      break;
                    case R:
                      /* LR: double rotation, left child's right child T
                         becomes the root.  */
                      r = links[depth], l = r->llink, t = l->rlink;
                      rl = t->rlink, lr = t->llink;
                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
                      l->balance = t->balance != 1 ? 0 : -1;
                      r->balance = t->balance != (char) -1 ? 0 : 1;
                      t->balance = 0;
                      break;
                    default:
                      abort ();
                    }
                  break;
                case 2:
                  switch (dirs[depth + 1])
                    {
                    case R:
                      /* RR: single rotation, right child T becomes the root.  */
                      l = links[depth], t = l->rlink, lr = t->llink;
                      t->llink = l, l->rlink = lr;
                      t->balance = l->balance = 0;
                      break;
                    case L:
                      /* RL: double rotation, right child's left child T
                         becomes the root.  */
                      l = links[depth], r = l->rlink, t = r->llink;
                      lr = t->llink, rl = t->rlink;
                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
                      l->balance = t->balance != 1 ? 0 : -1;
                      r->balance = t->balance != (char) -1 ? 0 : 1;
                      t->balance = 0;
                      break;
                    default:
                      abort ();
                    }
                  break;
                default:
                  abort ();
                }

              /* Hook the new subtree root T into the rotated node's parent.  */
              if (dirs[depth - 1] == L)
                links[depth - 1]->llink = t;
              else
                links[depth - 1]->rlink = t;
            }
        }

      trie = cur->trie;
    }

  /* Mark the node finally reached as accepting, encoding the
     index number of this word in the keyword set so far.  A node
     that is already accepting keeps its earlier index.  */
  if (!trie->accepting)
    {
      size_t words = kwset->words;
      trie->accepting = 2 * words + 1;
    }
  ++kwset->words;

  /* Keep track of the shortest and longest keyword lengths.  */
  if (trie->depth < kwset->mind)
    kwset->mind = trie->depth;
  if (trie->depth > kwset->maxd)
    kwset->maxd = trie->depth;
}

将所有的字符串都加入到树中之后，下一步就是建立失败指针。

/* Traverse the trie nodes breadth-first, starting at the root and
   following the `next' links; LAST tracks the tail of the queue that
   enqueue appends to.  */
  for (curr = last = kwset->trie; curr; curr = curr->next)
    {
      /* Enqueue the immediate descendants in the level order queue,
         which is what makes this traversal breadth-first.  */
      enqueue (curr->links, &last);

      /* Update the delta table for the descendants of this node.  */
      treedelta (curr->links, curr->depth, delta);

      /* Compute the failure pointers for the descendants of this node.  */
      treefails (curr->links, curr->fail, kwset->trie, reverse);
    }

到这一步，前期的准备工作就做好了。