java主副关键词组匹配升级高性能版

Hello.Reader

已于 2024-01-23 19:30:42 修改

阅读量434

点赞数 8

分类专栏：大数据　文章标签：java、开发语言

于 2024-01-23 17:48:47 首次发布

本文链接：https://blog.csdn.net/weixin_43114209/article/details/135777823

版权

大数据专栏收录该内容

10 篇文章 0 订阅

订阅专栏

文章介绍了如何使用Trie树结构进行主关键词和副关键词的高效存储与匹配，包括插入、搜索和构建失败指针的过程，以及在文本中搜索关键词并返回匹配ID的功能。

摘要由CSDN通过智能技术生成

这是上一版的升级版本。废话不多说，直接上代码；有时间笔者再给大家详细阐述，最近被这个问题搞得有点累（其中 searchPairs 方法封装了“关键词组 + 额外 id”的返回结果，有需要可以调用）：

package cn.konne.im.common.konneimcommon.utils;


import cn.konne.im.common.konneimcommon.model.KeyWordBean;
import com.alibaba.fastjson.JSONObject;
import com.sun.javafx.collections.MappingChange;
import org.apache.lucene.util.MapOfSets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;

import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * A single node of the keyword trie.
 *
 * <p>Every field is package-private and mutable: the {@code Trie} class in this
 * file reads and writes them directly while inserting keywords, building
 * failure links and scanning text.
 */
class TrieNode {
    /** Outgoing edges: next character to the child node reached by it. */
    Map<Character, TrieNode> children = new HashMap<>();

    /** {@code true} when a keyword ends at this node. */
    boolean isEndOfWord = false;

    /** Failure link to another node; {@code null} until it is assigned. */
    TrieNode fail = null;

    /** Main keywords recorded on this node. */
    Set<String> mainKeywords = new HashSet<>();

    /** Sub keywords recorded on this node. */
    Set<String> subKeywords = new HashSet<>();

    /** Creates an empty node: no children, no keywords, no failure link. */
    public TrieNode() {
        // All state is set up by the field initialisers above.
    }
}

/**
 * Keyword trie with failure links (an Aho-Corasick style automaton) that stores
 * two kinds of keywords, "main" and "sub", in the same tree.
 *
 * <p>Usage: insert keywords, call {@link #buildFailPointers()} once, then search.
 * Instances are not synchronised.
 */
class Trie {
    private final TrieNode root;

    public Trie() {
        root = new TrieNode();
    }

    /**
     * Inserts main keywords into the trie.
     *
     * @param mainKeywords main keywords separated by single spaces; {@code null}
     *                     or empty input is ignored
     */
    public void insertMainKeywords(String mainKeywords) {
        insertKeywords(mainKeywords, true);
    }

    /**
     * Inserts sub keywords into the trie.
     *
     * @param subKeywords sub keywords separated by single spaces; {@code null}
     *                    or empty input is ignored
     */
    public void insertSubKeywords(String subKeywords) {
        insertKeywords(subKeywords, false);
    }

    /**
     * Splits {@code keywords} on spaces and adds every non-empty token to the trie.
     *
     * @param keywords space separated keywords, may be {@code null}
     * @param main     {@code true} to record tokens as main keywords,
     *                 {@code false} to record them as sub keywords
     */
    private void insertKeywords(String keywords, boolean main) {
        if (keywords == null || keywords.isEmpty()) {
            return;
        }
        for (String keyword : keywords.split(" ")) {
            // Leading or doubled spaces produce empty tokens; inserting one would
            // mark the root itself as a keyword end.
            if (keyword.isEmpty()) {
                continue;
            }
            TrieNode current = root;
            for (char ch : keyword.toCharArray()) {
                current = current.children.computeIfAbsent(ch, k -> new TrieNode());
            }
            current.isEndOfWord = true;
            if (main) {
                current.mainKeywords.add(keyword);
            } else {
                current.subKeywords.add(keyword);
            }
        }
    }

    /**
     * Builds the failure links of the automaton with a breadth-first traversal.
     *
     * <p>Each node also inherits the keyword sets of its failure target, so a
     * keyword that is a proper suffix of the current path (for example "bc"
     * while walking "abc" of "abcd") is reported when that node is reached.
     * Call this after all keywords have been inserted.
     */
    public void buildFailPointers() {
        Queue<TrieNode> queue = new ArrayDeque<>();
        for (TrieNode child : root.children.values()) {
            child.fail = root;
            queue.add(child);
        }

        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();
            for (Map.Entry<Character, TrieNode> entry : current.children.entrySet()) {
                char ch = entry.getKey();
                TrieNode child = entry.getValue();

                // Walk the failure chain of the parent until a node with a 'ch' edge is found.
                TrieNode failNode = current.fail;
                while (failNode != null && !failNode.children.containsKey(ch)) {
                    failNode = failNode.fail;
                }
                child.fail = (failNode == null) ? root : failNode.children.get(ch);

                // The failure target is shallower, so the BFS has already completed
                // its keyword sets. Inherit them unconditionally, otherwise suffix
                // keywords are lost on nodes that are not keyword ends themselves.
                child.mainKeywords.addAll(child.fail.mainKeywords);
                child.subKeywords.addAll(child.fail.subKeywords);
                if (!child.mainKeywords.isEmpty() || !child.subKeywords.isEmpty()) {
                    child.isEndOfWord = true;
                }

                queue.add(child);
            }
        }
    }

    /**
     * Scans {@code text} and collects every main keyword that occurs in it.
     *
     * @param text text to scan; {@code null} or empty yields an empty set
     * @return the matched main keywords (a new mutable set, possibly empty)
     */
    public Set<String> searchMainKeywords(String text) {
        return search(text, true);
    }

    /**
     * Scans {@code text} and collects every sub keyword that occurs in it.
     *
     * @param text text to scan; {@code null} or empty yields an empty set
     * @return the matched sub keywords (a new mutable set, possibly empty)
     */
    public Set<String> searchSubKeywords(String text) {
        return search(text, false);
    }

    /**
     * Runs the automaton over {@code text} and gathers the keywords of one kind.
     *
     * @param text text to scan, may be {@code null}
     * @param main {@code true} to collect main keywords, {@code false} for sub keywords
     * @return the matched keywords of the requested kind
     */
    private Set<String> search(String text, boolean main) {
        Set<String> matched = new HashSet<>();
        if (text == null || text.isEmpty()) {
            return matched;
        }

        TrieNode current = root;
        for (char ch : text.toCharArray()) {
            TrieNode next = current.children.get(ch);
            while (next == null && current != root) {
                // A null link means buildFailPointers() was not called for this
                // node; restart from the root instead of failing with an NPE.
                current = (current.fail != null) ? current.fail : root;
                next = current.children.get(ch);
            }
            if (next == null) {
                // Still at the root and no edge for this character.
                continue;
            }
            current = next;
            if (current.isEndOfWord) {
                matched.addAll(main ? current.mainKeywords : current.subKeywords);
            }
        }
        return matched;
    }

    /**
     * Returns the matched keyword groups paired with an extra id.
     *
     * <p>When no sub keyword matches, each matched main keyword is paired with
     * {@code id} on its own; otherwise every "main + sub" concatenation is paired
     * with {@code id}.
     *
     * @param text text to scan
     * @param id   extra id attached to every result
     * @return the set of (keyword group, id) pairs; empty when no main keyword matches
     */
    private Set<Pair<String, Integer>> searchPairs(String text, int id) {
        Set<String> mainKeywords = searchMainKeywords(text);
        Set<String> subKeywords = searchSubKeywords(text);

        // Plain loops: the sets are tiny, so parallel streams only add overhead.
        Set<Pair<String, Integer>> pairs = new HashSet<>();
        for (String mainKeyword : mainKeywords) {
            if (subKeywords.isEmpty()) {
                pairs.add(new Pair<>(mainKeyword, id));
            } else {
                for (String subKeyword : subKeywords) {
                    pairs.add(new Pair<>(mainKeyword + subKeyword, id));
                }
            }
        }
        return pairs;
    }

    /**
     * Returns only the id of the match.
     *
     * <p>The result contains {@code id} as soon as at least one main keyword
     * occurs in {@code text}. Sub keyword matches never changed this result in
     * the previous implementation either, so they are no longer computed here.
     *
     * @param text text to scan
     * @param id   id to report on a match
     * @return a set containing {@code id} on a match, otherwise an empty set
     */
    private Set<Integer> searchPairsIds(String text, int id) {
        Set<Integer> ids = new HashSet<>();
        if (!searchMainKeywords(text).isEmpty()) {
            ids.add(id);
        }
        return ids;
    }

    /**
     * Matches the title text and the content text separately.
     *
     * @param titleText title text to scan
     * @param matchText content text to scan
     * @param id        id to report on a match
     * @return a map with the keys {@code "titlePairs"} and {@code "contentPairs"},
     *         each mapped to the (possibly empty) set of matched ids
     */
    public Map<String, Set<Integer>> searchMainAndCombinedPairs(String titleText, String matchText, int id) {
        Map<String, Set<Integer>> setMap = new HashMap<>();
        setMap.put("titlePairs", searchPairsIds(titleText, id));
        setMap.put("contentPairs", searchPairsIds(matchText, id));
        return setMap;
    }

}

/**
 * Matches a title text and a content text against a list of keyword beans,
 * running one matching task per bean on a thread pool.
 */
class ChineseKeywordMatcher1 {

    private static final Logger log = LoggerFactory.getLogger(ChineseKeywordMatcher1.class);


    /**
     * Demo entry point: parses a small JSON keyword list, runs the matcher on two
     * sample texts and logs every matched id.
     */
    public static void main(String[] args) throws ExecutionException, InterruptedException{

        String str = "[\n" +
                "    {\n" +
                "        \"id\":8,\n" +
                "        \"mainKeyWord\":\"渭南\",\n" +
                "        \"subKeyWord\":\"白庙村\"\n" +
                "    },\n" +
                "    {\n" +
                "        \"id\":2,\n" +
                "        \"mainKeyWord\":\"咸阳\",\n" +
                "        \"subKeyWord\":\"钓台 \"\n" +
                "    },\n" +
                "    {\n" +
                "        \"id\":3,\n" +
                "        \"mainKeyWord\":\"富平\",\n" +
                "        \"subKeyWord\":null\n" +
                "    },\n" +
                "    {\n" +
                "        \"id\":4,\n" +
                "        \"mainKeyWord\":\"河阳\",\n" +
                "        \"subKeyWord\":\"东坡区 \"\n" +
                "    }\n" +
                "]";
        List<KeyWordBean> keyWordBeans = JSONObject.parseArray(str, KeyWordBean.class);

        Map<String, Set<Integer>> setMap = ChineseKeywordMatcher1.searchCombinationsParallel("河阳五洲国际装饰博览城", "西咸尔雅路", keyWordBeans);

        for (Map.Entry<String, Set<Integer>> stringSetEntry : setMap.entrySet()) {
            for (Integer i : stringSetEntry.getValue()) {
                log.info("【匹配】进入词组匹配 stringSetEntry.getKey:【{}】stringSetEntry-value:【{}】", stringSetEntry.getKey(), i);
            }

        }

    }

    /**
     * Searches the keyword combinations of all beans in parallel.
     *
     * <p>The returned map always contains the keys {@code "titleList"} and
     * {@code "contentList"}, even when {@code keyWordBeans} is {@code null} or
     * empty, so callers can read both entries without a null check.
     *
     * @param titleText    title text
     * @param matchText    content text
     * @param keyWordBeans keyword beans to match; {@code null} entries are skipped
     * @return ids of the beans matched in the title ({@code "titleList"}) and in
     *         the content ({@code "contentList"})
     * @throws ExecutionException   if a matching task fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public static Map<String, Set<Integer>> searchCombinationsParallel(String titleText, String matchText, List<KeyWordBean> keyWordBeans)
            throws ExecutionException, InterruptedException {
        Map<String, Set<Integer>> combinedResult = new HashMap<>();
        combinedResult.put("titleList", new HashSet<>());
        combinedResult.put("contentList", new HashSet<>());
        if (keyWordBeans == null || keyWordBeans.isEmpty()) {
            return combinedResult;
        }

        List<Callable<Map<String, Set<Integer>>>> tasks = new ArrayList<>(keyWordBeans.size());
        for (KeyWordBean wordBean : keyWordBeans) {
            if (wordBean == null) {
                continue;
            }
            tasks.add(() -> matchSingleBean(titleText, matchText, wordBean));
        }
        if (tasks.isEmpty()) {
            return combinedResult;
        }

        // No point in starting more workers than there are tasks.
        int numThreads = Math.min(Runtime.getRuntime().availableProcessors(), tasks.size());
        ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
        try {
            // invokeAll returns only after every task has completed.
            List<Future<Map<String, Set<Integer>>>> futures = executorService.invokeAll(tasks);

            // Merge the per-bean results.
            for (Future<Map<String, Set<Integer>>> future : futures) {
                Map<String, Set<Integer>> result = future.get();
                for (Map.Entry<String, Set<Integer>> entry : result.entrySet()) {
                    combinedResult.computeIfAbsent(entry.getKey(), k -> new HashSet<>()).addAll(entry.getValue());
                }
            }
        } finally {
            // Always release the worker threads, also when invokeAll or get() throws.
            executorService.shutdown();
        }

        return combinedResult;
    }

    /**
     * Builds a trie for one keyword bean and matches both texts against it.
     *
     * @return a map with {@code "titleList"} and {@code "contentList"} holding the
     *         bean id where the respective text matched
     */
    private static Map<String, Set<Integer>> matchSingleBean(String titleText, String matchText, KeyWordBean wordBean) {
        Trie trie = new Trie();
        trie.insertMainKeywords(wordBean.getMainKeyWord());
        trie.insertSubKeywords(wordBean.getSubKeyWord());
        trie.buildFailPointers();

        Map<String, Set<Integer>> entryIntegerMap = trie.searchMainAndCombinedPairs(titleText, matchText, wordBean.getId());

        Map<String, Set<Integer>> mapPairs = new HashMap<>();
        mapPairs.put("titleList", new HashSet<>(entryIntegerMap.get("titlePairs")));
        mapPairs.put("contentList", new HashSet<>(entryIntegerMap.get("contentPairs")));
        return mapPairs;
    }

}