基于字典树实现 AC 自动机,用于内容关键词检索

 如何使用

private ACTrie ctgACTrie = new ACTrie(); // create the automaton

ctgACTrie.addKeyword("你好"); // register a keyword

private String passThoughCTG(String name) {
  Collection<MatchInfo> emits = ctgACTrie.search(name);
  if (emits.isEmpty())
    return null;
  MatchInfo m = null;
  for (MatchInfo emit : emits) {
    if (m == null || emit.getEnd()-emit.getStart() > m.getEnd()-m.getStart()) {
      m = emit;
    }
  }
  return m.getKeyword().toString()
}


/* returns 你好 */
passThoughCTG("啊啊啊你好啊啊啊"); // check whether this sentence contains a keyword

创建自动机类

package com.xdf.udf.util;

import java.util.*;
import lombok.Data;

/**
 * Aho-Corasick automaton built on a trie, used to find every occurrence of a
 * set of keywords in a text in a single pass.
 *
 * <p>Keywords may be added or removed at any time; failure links and merged
 * outputs are rebuilt lazily on the next call to {@link #search(String)} or
 * {@link #findAnyIn(String)}. Because of that lazy rebuild the class is not
 * safe for concurrent use without external synchronization.
 *
 * <p>Created 2020/5/20.
 *
 * @author DFT
 * @version V1.0
 * @see "https://blog.csdn.net/qq_44011386/article/details/117958782"
 */
public class ACTrie {
  /** True while failure links and merged outputs reflect the current keyword set. */
  private boolean failureSetted = false;
  /** Root of the trie; it never carries a keyword. */
  private Node root;

  public ACTrie() {
    this.root = new Node(true);
  }

  /**
   * Adds a group of keywords. {@code null} or empty collections are ignored,
   * as are {@code null} or empty elements.
   *
   * @param sequences the keywords to add
   */
  public void addKeywordList(Collection<? extends CharSequence> sequences) {
    if (sequences == null || sequences.isEmpty()) {
      return;
    }
    for (CharSequence sequence : sequences) {
      addKeyword(sequence);
    }
  }

  /**
   * Adds a single keyword. {@code null}, empty, and already-registered
   * keywords are ignored.
   *
   * @param cs the keyword to add
   */
  public void addKeyword(CharSequence cs) {
    if (cs == null || cs.length() == 0) {
      return;
    }
    // Snapshot the content so later mutation of the caller's CharSequence
    // (e.g. a StringBuilder) cannot corrupt the automaton.
    String keyword = cs.toString();
    Node currentState = this.root;
    for (int i = 0; i < keyword.length(); i++) {
      currentState = currentState.insert(keyword.charAt(i));
    }
    if (currentState.addKeyword(keyword)) {
      // New nodes have no failure link yet; force a rebuild before the next search.
      this.failureSetted = false;
    }
  }

  /**
   * Removes a single keyword. Does nothing when {@code cs} is {@code null},
   * empty, or not a registered keyword. Keywords that are a prefix of another
   * keyword are removed without affecting the longer keyword.
   *
   * @param cs the keyword to remove
   */
  public void deleteKeyword(CharSequence cs) {
    if (cs == null || cs.length() == 0) {
      return;
    }
    String keyword = cs.toString();
    int len = keyword.length();
    // path[i] is the node reached after consuming the first i characters.
    Node[] path = new Node[len + 1];
    path[0] = this.root;
    for (int i = 0; i < len; i++) {
      Node next = path[i].childAt(keyword.charAt(i));
      if (next == null) {
        return;
      }
      path[i + 1] = next;
    }
    if (!path[len].removeKeyword(keyword)) {
      return;
    }
    // Prune nodes that no longer lead to any keyword, walking back towards the root.
    for (int i = len; i > 0; i--) {
      Node node = path[i];
      if (node.hasChildren() || node.hasKeywords()) {
        break;
      }
      path[i - 1].removeChild(keyword.charAt(i - 1));
    }
    // Other nodes may still inherit the removed keyword or point at pruned nodes.
    this.failureSetted = false;
  }

  /**
   * Finds every occurrence of every registered keyword in {@code text}.
   *
   * @param text the text to scan; {@code null} is treated as empty
   * @return the matches in order of their end position (inclusive indices);
   *     empty when nothing matches
   */
  public Collection<MatchInfo> search(String text) {
    List<MatchInfo> matchInfos = new ArrayList<MatchInfo>();
    if (text == null || text.length() == 0) {
      return matchInfos;
    }
    if (!this.failureSetted) {
      setFailNode();
    }
    Node currentState = this.root;
    int len = text.length();
    for (int position = 0; position < len; position++) {
      currentState = currentState.nextNode(text.charAt(position));
      for (CharSequence emit : currentState.emit()) {
        matchInfos.add(new MatchInfo(position - emit.length() + 1, position, emit));
      }
    }
    return matchInfos;
  }

  /**
   * Tells whether at least one registered keyword occurs in {@code text}.
   *
   * @param text the text to scan; {@code null} is treated as empty
   * @return {@code true} if any keyword occurs in {@code text}
   */
  public boolean findAnyIn(String text) {
    if (text == null || text.length() == 0) {
      return false;
    }
    if (!this.failureSetted) {
      setFailNode();
    }
    Node currentState = this.root;
    int len = text.length();
    for (int position = 0; position < len; position++) {
      currentState = currentState.nextNode(text.charAt(position));
      if (!currentState.emit().isEmpty()) {
        // The first hit answers the question; no need to scan the rest.
        return true;
      }
    }
    return false;
  }

  /**
   * (Re)builds failure links and merged outputs for the whole trie with a
   * breadth-first traversal. Safe to call repeatedly: every node's output is
   * reset to its own keywords before inherited ones are merged in.
   */
  private void setFailNode() {
    Queue<Node> queue = new ArrayDeque<Node>();
    // Depth-1 nodes always fall back to the root and inherit nothing.
    for (Node rootChild : this.root.children()) {
      rootChild.setFailure(this.root);
      rootChild.resetEmits();
      queue.add(rootChild);
    }
    // Level order guarantees that a node's failure target (always shallower)
    // has been fully processed before the node itself.
    while (!queue.isEmpty()) {
      Node parentNode = queue.poll();
      for (Node child : parentNode.children()) {
        queue.add(child);
        // failure(child) = the state reached from failure(parent) on child's character
        Node failNode = parentNode.getFailure().nextNode(child.value);
        child.setFailure(failNode);
        child.resetEmits();
        child.addMatchInfo(failNode.emit());
      }
    }
    this.failureSetted = true;
  }

  /** A trie node, doubling as an automaton state. */
  private static class Node {
    private static final char EMPTY = '\0';
    private boolean isRoot = false; // whether this node is the root
    private Map<Character, Node> map; // children keyed by character
    private char value; // character on the edge leading to this node
    private Node failure; // failure link, valid after setFailNode()
    private List<String> keywords; // keywords ending exactly at this node
    private List<CharSequence> emits; // own keywords plus those inherited via failure links

    public Node(char value) {
      this.value = value;
      map = new HashMap<Character, Node>();
      keywords = new ArrayList<String>();
      emits = new ArrayList<CharSequence>();
    }

    /** Creates the root node when {@code isRoot} is {@code true}. */
    public Node(boolean isRoot) {
      this(EMPTY);
      this.isRoot = isRoot;
    }

    /** Returns the child for {@code character}, creating it if absent. */
    public Node insert(char character) {
      Node node = this.map.get(character);
      if (node == null) {
        node = new Node(character);
        map.put(character, node);
      }
      return node;
    }

    /** Returns the child for {@code character}, or {@code null} if there is none. */
    public Node childAt(char character) {
      return map.get(character);
    }

    /** Detaches the child for {@code character}, if any. */
    public void removeChild(char character) {
      map.remove(character);
    }

    /**
     * Follows the transition for {@code c}: the matching child of this node or
     * of the nearest node on the failure chain that has one, else the root.
     * Iterative so that long failure chains cannot overflow the stack.
     */
    private Node nextNode(char c) {
      Node state = this;
      while (true) {
        Node next = state.map.get(c);
        if (next != null) {
          return next;
        }
        if (state.isRoot) {
          return state;
        }
        state = state.failure;
      }
    }

    /**
     * Registers {@code keyword} as ending at this node.
     *
     * @return {@code false} if it was already registered here
     */
    public boolean addKeyword(String keyword) {
      if (keywords.contains(keyword)) {
        return false;
      }
      keywords.add(keyword);
      return true;
    }

    /**
     * Unregisters {@code keyword} from this node.
     *
     * @return {@code false} if it was not registered here
     */
    public boolean removeKeyword(String keyword) {
      return keywords.remove(keyword);
    }

    public boolean hasKeywords() {
      return !keywords.isEmpty();
    }

    public boolean hasChildren() {
      return !map.isEmpty();
    }

    /** Resets the merged output to this node's own keywords only. */
    public void resetEmits() {
      emits.clear();
      emits.addAll(keywords);
    }

    /** Appends inherited keywords to the merged output. */
    public void addMatchInfo(Collection<CharSequence> inherited) {
      emits.addAll(inherited);
    }

    public Collection<Node> children() {
      return this.map.values();
    }

    public void setFailure(Node node) {
      failure = node;
    }

    public Node getFailure() {
      return failure;
    }

    /** Returns the merged output; only meaningful after setFailNode() has run. */
    public Collection<CharSequence> emit() {
      return this.emits;
    }
  }

  /** One occurrence of a keyword in the searched text; indices are inclusive. */
  public static class MatchInfo {
    private final CharSequence keyword; // the matched keyword
    private final int start;
    private final int end;

    /**
     * @param start index of the first matched character
     * @param end index of the last matched character (inclusive)
     * @param keyword the matched keyword
     */
    public MatchInfo(final int start, final int end, final CharSequence keyword) {
      this.start = start;
      this.end = end;
      this.keyword = keyword;
    }

    public CharSequence getKeyword() {
      return keyword;
    }

    public int getStart() {
      return start;
    }

    public int getEnd() {
      return end;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (!(o instanceof MatchInfo)) {
        return false;
      }
      MatchInfo other = (MatchInfo) o;
      if (start != other.start || end != other.end) {
        return false;
      }
      return keyword == null ? other.keyword == null : keyword.equals(other.keyword);
    }

    @Override
    public int hashCode() {
      int result = keyword == null ? 0 : keyword.hashCode();
      result = 31 * result + start;
      result = 31 * result + end;
      return result;
    }

    @Override
    public String toString() {
      return "ACTrie.MatchInfo(keyword=" + keyword + ", start=" + start + ", end=" + end + ")";
    }
  }

  // demo
  public static void main(String[] args) {
    List<String> keywords = Arrays.asList("coxquery#@{","coxquery#@config#@{","coxqueryhealth#@{","agent#@{","agent#@config#@{","agenthealth#@{");
    ACTrie trie = new ACTrie();
    trie.addKeyword("ctg#@{");
    trie.addKeyword("ctg#@config#@{");
    trie.addKeyword("ctghealth#@{");
    trie.addKeyword("cox#@{");
    trie.addKeyword("cox#@config#@{");
    trie.addKeyword("coxhealth#@{");
    trie.addKeywordList(keywords);

    trie.deleteKeyword("coxhealth#@{");
    System.out.println(trie.findAnyIn("#@monitor#@dataquery#@coxhealth#@{"));
    System.out.println(trie.findAnyIn("#@monitor#@dataquery#@cox#@config#@{"));
    Collection<MatchInfo> emits = trie.search("#@monitor#@dataquery#@coxquery#@config#@{");
    for (MatchInfo emit : emits) {
      System.out.println(emit);
    }
  }

}

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
// The Java code below implements an Aho-Corasick automaton on top of an
// array-backed trie: every node keeps a fixed 26-slot child table for 'a'..'z'.
// NOTE(review): the accompanying text describes this as a "Double Array Trie"
// (双数组字典树), but the nodes use plain per-node child arrays, not base/check arrays.
import java.util.*;

/** One trie node / automaton state, stored by index inside {@code ACTrie}. */
class ACNode {
  public int[] children; // child node indices, -1 when absent
  public int parent; // parent node index, -1 for the root
  public boolean isWordEnd; // whether a pattern ends exactly at this node
  public char character; // character on the edge leading to this node
  public int failureLink; // failure link, -1 until the links are built
  public List<Integer> output; // lengths of all patterns ending here, own and inherited
  public int wordLength; // length of the pattern ending exactly here, 0 if none

  public ACNode(int parent, char character) {
    children = new int[26];
    Arrays.fill(children, -1);
    this.parent = parent;
    this.character = character;
    isWordEnd = false;
    failureLink = -1;
    output = new ArrayList<>();
    wordLength = 0;
  }
}

/** Aho-Corasick automaton over the lowercase ASCII alphabet {@code a-z}. */
class ACTrie {
  private static final int ALPHABET_SIZE = 26;

  private List<ACNode> trie;
  /** True when patterns were inserted since the failure links were last built. */
  private boolean linksStale = true;

  public ACTrie() {
    trie = new ArrayList<>();
    trie.add(new ACNode(-1, (char) 0));
  }

  /** Maps {@code c} to its child slot, or -1 when it is outside {@code a-z}. */
  private static int slotOf(char c) {
    int index = c - 'a';
    return (index >= 0 && index < ALPHABET_SIZE) ? index : -1;
  }

  /**
   * Adds a pattern. {@code null} and empty words are ignored.
   *
   * @param word the pattern, consisting only of {@code a-z}
   * @throws IllegalArgumentException if {@code word} contains a character outside {@code a-z}
   */
  public void insert(String word) {
    if (word == null || word.isEmpty()) {
      return;
    }
    // Validate before mutating so a bad pattern cannot leave a partial path behind.
    for (int i = 0; i < word.length(); i++) {
      if (slotOf(word.charAt(i)) < 0) {
        throw new IllegalArgumentException(
            "Unsupported character '" + word.charAt(i) + "' at index " + i + " in \"" + word + "\"");
      }
    }
    int node = 0;
    for (int i = 0; i < word.length(); i++) {
      char c = word.charAt(i);
      int index = slotOf(c);
      if (trie.get(node).children[index] == -1) {
        trie.add(new ACNode(node, c));
        trie.get(node).children[index] = trie.size() - 1;
      }
      node = trie.get(node).children[index];
    }
    ACNode terminal = trie.get(node);
    terminal.isWordEnd = true;
    terminal.wordLength = word.length();
    linksStale = true;
  }

  /** Resets a node's output to the pattern (if any) that ends exactly there. */
  private static void resetOutput(ACNode node) {
    node.output.clear();
    if (node.isWordEnd) {
      node.output.add(node.wordLength);
    }
  }

  /**
   * (Re)builds all failure links and merged outputs breadth-first. Calling it
   * explicitly is optional: {@link #search(String)} rebuilds when needed.
   */
  public void buildFailureLinks() {
    Queue<Integer> queue = new ArrayDeque<>();
    ACNode root = trie.get(0);
    // The root fails to itself and never ends a pattern.
    root.failureLink = 0;
    root.output.clear();
    // Depth-1 nodes fail to the root.
    for (int child : root.children) {
      if (child != -1) {
        ACNode childNode = trie.get(child);
        childNode.failureLink = 0;
        resetOutput(childNode);
        queue.add(child);
      }
    }
    while (!queue.isEmpty()) {
      ACNode current = trie.get(queue.poll());
      for (int i = 0; i < ALPHABET_SIZE; i++) {
        int child = current.children[i];
        if (child == -1) {
          continue;
        }
        queue.add(child);
        int failure = current.failureLink;
        while (failure != 0 && trie.get(failure).children[i] == -1) {
          failure = trie.get(failure).failureLink;
        }
        if (trie.get(failure).children[i] != -1) {
          failure = trie.get(failure).children[i];
        }
        ACNode childNode = trie.get(child);
        childNode.failureLink = failure;
        // Own pattern first, then everything the failure target already outputs.
        resetOutput(childNode);
        childNode.output.addAll(trie.get(failure).output);
      }
    }
    linksStale = false;
  }

  /**
   * Scans {@code text} and reports where patterns end.
   *
   * @param text the text to scan; {@code null} yields an empty result. Characters
   *     outside {@code a-z} cannot be part of a match and reset the scan.
   * @return every index {@code i} of {@code text} at which at least one pattern
   *     ends, in increasing order, each index reported once
   */
  public List<Integer> search(String text) {
    List<Integer> result = new ArrayList<>();
    if (text == null) {
      return result;
    }
    if (linksStale) {
      buildFailureLinks();
    }
    int node = 0;
    for (int i = 0; i < text.length(); i++) {
      int index = slotOf(text.charAt(i));
      if (index < 0) {
        node = 0;
        continue;
      }
      while (node != 0 && trie.get(node).children[index] == -1) {
        node = trie.get(node).failureLink;
      }
      if (trie.get(node).children[index] != -1) {
        node = trie.get(node).children[index];
      }
      if (!trie.get(node).output.isEmpty()) {
        result.add(i);
      }
    }
    return result;
  }
}

public class Main {
  public static void main(String[] args) {
    ACTrie acTrie = new ACTrie();
    // add the patterns
    acTrie.insert("he");
    acTrie.insert("she");
    acTrie.insert("his");
    acTrie.insert("hers");
    // build the failure links
    acTrie.buildFailureLinks();
    // search the text
    String text = "ushers";
    List<Integer> result = acTrie.search(text);
    // print the positions at which a pattern ends
    for (int position : result) {
      System.out.println("Pattern found ending at index: " + position);
    }
  }
}
// The code above searches a text for occurrences of several patterns at once.
// The example registers the patterns "he", "she", "his" and "hers", scans the
// text "ushers", and prints the index at which each match ends.

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值