这是上一版的升级版本,废话不多说,直接上代码。有时间笔者再给大家详细阐述,最近被这个问题搞得有点累(其中 searchPairs 方法封装了"关键词组和额外 id"的返回结果,有需要可以调用):
package cn.konne.im.common.konneimcommon.utils;
import cn.konne.im.common.konneimcommon.model.KeyWordBean;
import com.alibaba.fastjson.JSONObject;
import com.sun.javafx.collections.MappingChange;
import org.apache.lucene.util.MapOfSets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;
import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * One node of the keyword trie.
 *
 * <p>A node stores its outgoing edges, the failure link used while matching,
 * and the main / sub keywords that are attached to it.
 */
class TrieNode {
    /** Outgoing edges, keyed by the next character. */
    Map<Character, TrieNode> children = new HashMap<>();

    /** {@code true} when a keyword ends at this node. */
    boolean isEndOfWord = false;

    /** Failure link to another node; {@code null} until it is assigned. */
    TrieNode fail = null;

    /** Main keywords attached to this node. */
    Set<String> mainKeywords = new HashSet<>();

    /** Sub keywords attached to this node. */
    Set<String> subKeywords = new HashSet<>();

    /**
     * Creates an empty node: no children, no keywords, no failure link.
     */
    public TrieNode() {
        // All state is set up by the field initializers above.
    }
}
/**
 * Keyword trie with failure links (an Aho-Corasick style automaton).
 *
 * <p>Two kinds of keywords, "main" and "sub", are stored in the same tree; each
 * node keeps them in separate sets. Typical usage: insert the keywords, call
 * {@link #buildFailPointers()} once, then run the search methods.
 */
class Trie {
    private final TrieNode root;

    public Trie() {
        root = new TrieNode();
    }

    /**
     * Inserts main keywords into the trie.
     *
     * @param mainKeywords main keywords separated by a single space character;
     *                     {@code null} or an empty string is ignored
     */
    public void insertMainKeywords(String mainKeywords) {
        insertKeywords(mainKeywords, true);
    }

    /**
     * Inserts sub keywords into the trie.
     *
     * @param subKeywords sub keywords separated by a single space character;
     *                    {@code null} or an empty string is ignored
     */
    public void insertSubKeywords(String subKeywords) {
        insertKeywords(subKeywords, false);
    }

    /**
     * Splits {@code keywords} on spaces and inserts every non-empty token.
     *
     * @param keywords space-separated keywords, may be {@code null}
     * @param main     {@code true} to record tokens as main keywords,
     *                 {@code false} to record them as sub keywords
     */
    private void insertKeywords(String keywords, boolean main) {
        if (keywords == null || keywords.isEmpty()) {
            return;
        }
        for (String keyword : keywords.split(" ")) {
            // Leading or consecutive spaces produce empty tokens; inserting one
            // would mark the root itself as the end of a keyword.
            if (keyword.isEmpty()) {
                continue;
            }
            TrieNode current = root;
            for (char ch : keyword.toCharArray()) {
                current = current.children.computeIfAbsent(ch, k -> new TrieNode());
            }
            current.isEndOfWord = true;
            if (main) {
                current.mainKeywords.add(keyword);
            } else {
                current.subKeywords.add(keyword);
            }
        }
    }

    /**
     * Builds the failure links of the trie (breadth-first), as used by
     * Aho-Corasick multi-pattern matching.
     *
     * <p>Each node also inherits the keywords of the node its failure link
     * points to, so a keyword that is a proper suffix of the path to a node is
     * reported even when that node is not itself the end of a keyword.
     */
    public void buildFailPointers() {
        Queue<TrieNode> queue = new ArrayDeque<>();
        for (TrieNode child : root.children.values()) {
            child.fail = root;
            queue.add(child);
        }
        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();
            for (Map.Entry<Character, TrieNode> entry : current.children.entrySet()) {
                char ch = entry.getKey();
                TrieNode child = entry.getValue();
                // Follow failure links until a node with an edge for ch is found;
                // root.fail is null, which ends the walk.
                TrieNode failNode = current.fail;
                while (failNode != null && !failNode.children.containsKey(ch)) {
                    failNode = failNode.fail;
                }
                if (failNode == null) {
                    child.fail = root;
                } else {
                    child.fail = failNode.children.get(ch);
                    // The fail target is shallower and therefore already fully
                    // processed, so its keyword sets are complete at this point.
                    if (child.fail.isEndOfWord) {
                        child.isEndOfWord = true;
                        child.mainKeywords.addAll(child.fail.mainKeywords);
                        child.subKeywords.addAll(child.fail.subKeywords);
                    }
                }
                queue.add(child);
            }
        }
    }

    /**
     * Searches the text for main keywords.
     *
     * @param text text to scan; {@code null} or empty yields an empty result
     * @return the set of main keywords found in the text
     */
    public Set<String> searchMainKeywords(String text) {
        return search(text, true);
    }

    /**
     * Searches the text for sub keywords.
     *
     * @param text text to scan; {@code null} or empty yields an empty result
     * @return the set of sub keywords found in the text
     */
    public Set<String> searchSubKeywords(String text) {
        return search(text, false);
    }

    /**
     * Runs the automaton over {@code text} and collects the keywords of the
     * requested kind.
     *
     * @param text text to scan, may be {@code null}
     * @param main {@code true} to collect main keywords, {@code false} for sub keywords
     * @return a new mutable set with the matched keywords
     */
    private Set<String> search(String text, boolean main) {
        Set<String> matched = new HashSet<>();
        if (text == null || text.isEmpty()) {
            return matched;
        }
        TrieNode current = root;
        for (char ch : text.toCharArray()) {
            while (current != root && !current.children.containsKey(ch)) {
                // A missing failure link (buildFailPointers not called, or a
                // keyword inserted afterwards) falls back to the root.
                current = current.fail != null ? current.fail : root;
            }
            TrieNode next = current.children.get(ch);
            if (next == null) {
                // Only reachable when current is the root: no keyword starts with ch.
                continue;
            }
            current = next;
            if (current.isEndOfWord) {
                matched.addAll(main ? current.mainKeywords : current.subKeywords);
            }
        }
        return matched;
    }

    /**
     * Returns keyword groups paired with the given id.
     *
     * <p>When no sub keyword is found in the text, each matched main keyword is
     * paired with the id on its own; otherwise every matched main keyword is
     * concatenated with every matched sub keyword.
     *
     * @param text text to scan
     * @param id   extra id attached to every pair
     * @return the resulting set of (keyword group, id) pairs
     */
    private Set<Pair<String, Integer>> searchPairs(String text, int id) {
        Set<String> mainKeywords = searchMainKeywords(text);
        Set<String> subKeywords = searchSubKeywords(text);
        return mainKeywords.stream()
                .flatMap(mainKeyword ->
                        subKeywords.isEmpty()
                                ? Stream.of(new Pair<>(mainKeyword, id))
                                : subKeywords.stream().map(subKeyword -> new Pair<>(mainKeyword + subKeyword, id))
                )
                .collect(Collectors.toSet());
    }

    /**
     * Returns only the id of a match.
     *
     * <p>The result contains {@code id} when at least one main keyword is found
     * in the text and is empty otherwise.
     *
     * @param text text to scan
     * @param id   id to report on a match
     * @return a new mutable set that is either empty or contains {@code id}
     */
    private Set<Integer> searchPairsIds(String text, int id) {
        // NOTE(review): sub keyword matches never change this result - the id is
        // reported even when no sub keyword is present in the text. Confirm that
        // "main keyword alone is enough" is the intended rule.
        Set<Integer> ids = new HashSet<>();
        if (!searchMainKeywords(text).isEmpty()) {
            ids.add(id);
        }
        return ids;
    }

    /**
     * Matches the title and the content text and reports the id for each.
     *
     * @param titleText title text to scan
     * @param matchText content text to scan
     * @param id        id to report on a match
     * @return a map with the keys {@code "titlePairs"} and {@code "contentPairs"},
     *         each mapped to a set that is empty or contains {@code id}
     */
    public Map<String, Set<Integer>> searchMainAndCombinedPairs(String titleText, String matchText, int id) {
        Map<String, Set<Integer>> setMap = new HashMap<>();
        setMap.put("titlePairs", searchPairsIds(titleText, id));
        setMap.put("contentPairs", searchPairsIds(matchText, id));
        return setMap;
    }
}
/**
 * Matches a title and a content text against a list of keyword beans, building
 * one {@link Trie} per bean and running the beans on a thread pool.
 */
class ChineseKeywordMatcher1 {
    private static final Logger log = LoggerFactory.getLogger(ChineseKeywordMatcher1.class);

    /**
     * Small demo: parses a hard-coded JSON keyword list, runs the matcher and
     * logs every (result key, id) combination.
     */
    public static void main(String[] args) throws ExecutionException, InterruptedException {
        String str = "[\n" +
                " {\n" +
                " \"id\":8,\n" +
                " \"mainKeyWord\":\"渭南\",\n" +
                " \"subKeyWord\":\"白庙村\"\n" +
                " },\n" +
                " {\n" +
                " \"id\":2,\n" +
                " \"mainKeyWord\":\"咸阳\",\n" +
                " \"subKeyWord\":\"钓台 \"\n" +
                " },\n" +
                " {\n" +
                " \"id\":3,\n" +
                " \"mainKeyWord\":\"富平\",\n" +
                " \"subKeyWord\":null\n" +
                " },\n" +
                " {\n" +
                " \"id\":4,\n" +
                " \"mainKeyWord\":\"河阳\",\n" +
                " \"subKeyWord\":\"东坡区 \"\n" +
                " }\n" +
                "]";
        List<KeyWordBean> keyWordBeans = JSONObject.parseArray(str, KeyWordBean.class);
        Map<String, Set<Integer>> setMap = ChineseKeywordMatcher1.searchCombinationsParallel("河阳五洲国际装饰博览城", "西咸尔雅路", keyWordBeans);
        for (Map.Entry<String, Set<Integer>> stringSetEntry : setMap.entrySet()) {
            for (Integer i : stringSetEntry.getValue()) {
                log.info("【匹配】进入词组匹配 stringSetEntry.getKey:【{}】stringSetEntry-value:【{}】", stringSetEntry.getKey(), i);
            }
        }
    }

    /**
     * Searches the keyword beans in parallel.
     *
     * <p>Each bean is matched in its own task; the per-bean results are merged
     * under the keys {@code "titleList"} and {@code "contentList"}.
     *
     * @param titleText    title text
     * @param matchText    content text
     * @param keyWordBeans keyword beans; {@code null} or empty yields an empty map
     * @return merged ids per result key; the map has no keys when there are no beans
     * @throws ExecutionException   if a matching task failed
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public static Map<String, Set<Integer>> searchCombinationsParallel(String titleText, String matchText, List<KeyWordBean> keyWordBeans)
            throws ExecutionException, InterruptedException {
        Map<String, Set<Integer>> combinedResult = new HashMap<>();
        if (keyWordBeans == null || keyWordBeans.isEmpty()) {
            return combinedResult;
        }

        List<Callable<Map<String, Set<Integer>>>> tasks = new ArrayList<>(keyWordBeans.size());
        for (KeyWordBean wordBean : keyWordBeans) {
            tasks.add(() -> matchSingleBean(titleText, matchText, wordBean));
        }

        // Never start more threads than there are tasks to run.
        int numThreads = Math.min(Runtime.getRuntime().availableProcessors(), tasks.size());
        ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
        List<Future<Map<String, Set<Integer>>>> futures;
        try {
            // invokeAll returns only after every task has completed.
            futures = executorService.invokeAll(tasks);
        } finally {
            // Release the pool threads even when invokeAll is interrupted.
            executorService.shutdown();
        }

        // Merge the per-bean results.
        for (Future<Map<String, Set<Integer>>> future : futures) {
            Map<String, Set<Integer>> result = future.get();
            for (Map.Entry<String, Set<Integer>> entry : result.entrySet()) {
                combinedResult.computeIfAbsent(entry.getKey(), k -> new HashSet<>()).addAll(entry.getValue());
            }
        }
        return combinedResult;
    }

    /**
     * Builds a trie for one bean and matches both texts against it.
     *
     * @return a map with the keys {@code "titleList"} and {@code "contentList"}
     */
    private static Map<String, Set<Integer>> matchSingleBean(String titleText, String matchText, KeyWordBean wordBean) {
        Trie trie = new Trie();
        trie.insertMainKeywords(wordBean.getMainKeyWord());
        trie.insertSubKeywords(wordBean.getSubKeyWord());
        trie.buildFailPointers();
        Map<String, Set<Integer>> entryIntegerMap = trie.searchMainAndCombinedPairs(titleText, matchText, wordBean.getId());

        Map<String, Set<Integer>> mapPairs = new HashMap<>();
        mapPairs.put("titleList", new HashSet<>(entryIntegerMap.get("titlePairs")));
        mapPairs.put("contentList", new HashSet<>(entryIntegerMap.get("contentPairs")));
        return mapPairs;
    }
}