高性能AC算法多关键词匹配文本功能Java实现

直接上测试结果:

测试规模:1000000 条待匹配文本(数据集,每条由两个随机 UUID 拼接而成),
1000000 个关键词(匹配词,每个为一个随机 UUID,另外再加上 "11" 和 "22" 两个关键词)。

装载消耗时间(包含生成数据集、添加全部关键词、构建失败指针):20869 毫秒
匹配消耗时间(对全部 1000000 条文本逐条匹配):6599 毫秒

代码和测试案例:

package com.baian.tggroupmessagematchkeyword.ac;

import lombok.Data;

import java.util.*;

/**
 * @program: tg-parent
 * @description: ac
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 17:20
 **/
@Data
public class AhoCorasick {
    /*
     * Multi-keyword matcher based on the Aho-Corasick automaton.
     *
     * Usage: call addKeyword(...) for every keyword, then buildFailureLinks()
     * once, then searchKeywords(...) as often as needed. Adding keywords after
     * building requires calling buildFailureLinks() again, otherwise the new
     * nodes have no failure/output links and overlapping matches can be missed.
     */
    private TrieNode root;

    /** Creates an empty matcher containing only the root node. */
    public AhoCorasick() {
        root = new TrieNode();
    }

    /**
     * Inserts a keyword into the trie.
     *
     * <p>An empty keyword is ignored because it would match at every position.
     * Adding the same keyword more than once has no additional effect.
     *
     * @param keyword the keyword to register; must not be {@code null}
     * @throws NullPointerException if {@code keyword} is {@code null}
     */
    public void addKeyword(String keyword) {
        Objects.requireNonNull(keyword, "keyword");
        if (keyword.isEmpty()) {
            return;
        }

        TrieNode current = root;
        for (int i = 0; i < keyword.length(); i++) {
            char ch = keyword.charAt(i);
            // Record the edge character on the node when it is created so that
            // getKey() is truthful (it used to stay at the default '\0').
            current = current.getChildren().computeIfAbsent(ch, c -> {
                TrieNode node = new TrieNode();
                node.setKey(c);
                return node;
            });
        }

        current.setEndOfWord(true);
        if (!current.getKeywords().contains(keyword)) {
            current.addKeyword(keyword);
        }
    }

    /**
     * Computes the failure link and the output link of every node with a
     * breadth-first traversal of the trie.
     *
     * <p>The failure link of a node points to the node spelling the longest
     * proper suffix of its path that is also a path in the trie (or to the root).
     * The output link points to the nearest node on the failure chain that ends a
     * keyword, so the search can report all suffix keywords without scanning the
     * whole failure chain and without copying keyword lists between nodes.
     *
     * <p>The method only overwrites links, so it may safely be called again after
     * more keywords have been added.
     */
    public void buildFailureLinks() {
        Queue<TrieNode> queue = new ArrayDeque<>();
        root.setFailure(null);
        root.setOutput(null);
        queue.offer(root);

        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();

            // Iterate over entries: the map key is the authoritative edge character.
            for (Map.Entry<Character, TrieNode> edge : current.getChildren().entrySet()) {
                Character ch = edge.getKey();
                TrieNode child = edge.getValue();

                TrieNode failure = current.getFailure();
                while (failure != null && !failure.getChildren().containsKey(ch)) {
                    failure = failure.getFailure();
                }

                // Falling off the root means no proper suffix is in the trie.
                TrieNode target = (failure == null) ? root : failure.getChildren().get(ch);
                child.setFailure(target);

                // The target is shallower than the child, so BFS order guarantees
                // that its own output link has already been computed.
                child.setOutput(target.isEndOfWord() ? target : target.getOutput());

                queue.offer(child);
            }
        }
    }

    /**
     * Finds every occurrence of every registered keyword in the given text.
     *
     * <p>Matches are appended in the order their last character appears in the
     * text; for matches ending at the same position the longest keyword comes
     * first. A keyword occurring several times is reported once per occurrence.
     *
     * @param text the text to scan; {@code null} is treated like an empty text
     * @return a new mutable list of matched keywords, empty if nothing matched
     */
    public List<String> searchKeywords(String text) {
        List<String> result = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return result;
        }

        TrieNode current = root;
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);

            // Follow failure links until some state has an edge for ch or we are
            // back at the root. A missing failure link (automaton not built yet)
            // is treated as a link to the root.
            TrieNode next = current.getChildren().get(ch);
            while (next == null && current != root) {
                TrieNode failure = current.getFailure();
                current = (failure != null) ? failure : root;
                next = current.getChildren().get(ch);
            }

            if (next == null) {
                // Not even the root has an edge for this character.
                continue;
            }
            current = next;

            // Report the keyword ending exactly here, then every shorter keyword
            // that is a suffix of the current path.
            TrieNode hit = current.isEndOfWord() ? current : current.getOutput();
            while (hit != null) {
                result.addAll(hit.getKeywords());
                hit = hit.getOutput();
            }
        }

        return result;
    }

    /** A single state of the automaton. */
    public static class TrieNode {
        /** Character on the edge leading into this node ('\0' for the root). */
        private char key;
        /** Whether a keyword ends at this node. */
        private boolean endOfWord;
        /** Failure link; {@code null} for the root or before the links are built. */
        private TrieNode failure;
        /** Nearest node on the failure chain that ends a keyword, or {@code null}. */
        private TrieNode output;
        /** Outgoing edges keyed by character. */
        private Map<Character, TrieNode> children;
        /** Keywords stored on this node. */
        private List<String> keywords;

        /** Creates a node with no edges and no keywords. */
        public TrieNode() {
            children = new HashMap<>();
            keywords = new ArrayList<>();
        }

        /** @return the character on the edge leading into this node */
        public char getKey() {
            return key;
        }

        /** @param key the character on the edge leading into this node */
        public void setKey(char key) {
            this.key = key;
        }

        /** @return {@code true} if a keyword ends at this node */
        public boolean isEndOfWord() {
            return endOfWord;
        }

        /** @param endOfWord whether a keyword ends at this node */
        public void setEndOfWord(boolean endOfWord) {
            this.endOfWord = endOfWord;
        }

        /** @return the failure link, or {@code null} if none is set */
        public TrieNode getFailure() {
            return failure;
        }

        /** @param failure the failure link, or {@code null} */
        public void setFailure(TrieNode failure) {
            this.failure = failure;
        }

        /** @return the output link, or {@code null} if none is set */
        public TrieNode getOutput() {
            return output;
        }

        /** @param output the output link, or {@code null} */
        public void setOutput(TrieNode output) {
            this.output = output;
        }

        /** @return the live, mutable map of outgoing edges */
        public Map<Character, TrieNode> getChildren() {
            return children;
        }

        /** @return the live, mutable list of keywords stored on this node */
        public List<String> getKeywords() {
            return keywords;
        }

        /** @param keyword keyword to append to this node's keyword list */
        public void addKeyword(String keyword) {
            keywords.add(keyword);
        }

        /** @param keywords keywords to append to this node's keyword list */
        public void addAllKeywords(List<String> keywords) {
            this.keywords.addAll(keywords);
        }
    }
}

main:

package test;

import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * @program: tg-parent
 * @description: 多样本数据集 测试。
 * @author: <发哥讲Java-694204477@qq.com>
 * @create: 2023-09-19 14:11
 **/
public class TestMain001 {
    /**
     * Benchmark driver: generates random messages and random keywords, builds the
     * matcher, then scans every message, printing the elapsed wall-clock
     * milliseconds of the loading phase and of the matching phase.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final int sampleCount = 1000000;

        // ---- loading phase: data set generation + keyword insertion + link build ----
        long loadStartedAt = System.currentTimeMillis();

        List<String> messages = new ArrayList<>(sampleCount);
        int generated = 0;
        while (generated < sampleCount) {
            String firstHalf = UUID.randomUUID().toString();
            String secondHalf = UUID.randomUUID().toString();
            messages.add(firstHalf + secondHalf);
            generated++;
        }

        AhoCorasick matcher = new AhoCorasick();
        int inserted = 0;
        while (inserted < sampleCount) {
            matcher.addKeyword(UUID.randomUUID().toString());
            inserted++;
        }
        // Two short keywords that are likely to occur inside the random messages.
        matcher.addKeyword("11");
        matcher.addKeyword("22");
        matcher.buildFailureLinks();

        long loadFinishedAt = System.currentTimeMillis();
        System.out.println("装载消耗时间:" + (loadFinishedAt - loadStartedAt));

        // ---- matching phase: scan every message; results are intentionally discarded ----
        long matchStartedAt = System.currentTimeMillis();
        for (int index = 0; index < messages.size(); index++) {
            matcher.searchKeywords(messages.get(index));
        }
        long matchFinishedAt = System.currentTimeMillis();

        System.out.println("消耗时间:" + (matchFinishedAt - matchStartedAt));
    }
}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

航迹者

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值