AC自动机实现自然语言关键词高亮
Maven 依赖坐标
<!-- HanLP dependency: supplies com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie, which the Java code below imports. -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.8</version>
</dependency>
代码
import com.cmbchina.gsm.config.GSMGlobalConfig;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
/**
 * Keyword highlighting backed by an Aho-Corasick automaton
 * ({@link AhoCorasickDoubleArrayTrie}).
 *
 * <p>Typical use: build the automaton once with {@link #buildACDT(Set)} and pass it
 * to {@link #highLight(String, AhoCorasickDoubleArrayTrie)} for each text to decorate.
 *
 * @version 1.0
 * @author IT803300
 * @date 2022-08-15
 */
public class KeywordMatchUtil {

    /**
     * Builds an Aho-Corasick automaton in which every keyword maps to itself.
     *
     * <p>{@code null} and empty keywords are skipped: a {@link TreeMap} with natural
     * ordering rejects {@code null} keys, and an empty keyword can never cover any
     * character of a text, so neither is useful as a dictionary entry.
     *
     * @param keywords keywords to recognise; must not be {@code null}
     * @return the automaton built from the usable keywords
     */
    public static AhoCorasickDoubleArrayTrie<String> buildACDT(Set<String> keywords) {
        TreeMap<String, String> dictionary = new TreeMap<>();
        for (String keyword : keywords) {
            if (keyword == null || keyword.isEmpty()) {
                continue;
            }
            dictionary.put(keyword, keyword);
        }
        AhoCorasickDoubleArrayTrie<String> acdt = new AhoCorasickDoubleArrayTrie<>();
        acdt.build(dictionary);
        return acdt;
    }

    /**
     * Wraps every stretch of {@code originText} covered by at least one keyword hit
     * with {@code GSMGlobalConfig.highlightPrefix} and {@code GSMGlobalConfig.highlightSuffix}.
     *
     * <p>Overlapping or directly adjacent hits are merged into a single highlighted
     * stretch, so prefix/suffix markers are never nested.
     *
     * @param originText text to decorate; {@code null} or empty input is returned unchanged
     * @param acdt automaton holding the keywords, e.g. from {@link #buildACDT(Set)}
     * @return the text with highlight markers inserted
     */
    public static String highLight(String originText, AhoCorasickDoubleArrayTrie<String> acdt) {
        if (originText == null || originText.isEmpty()) {
            return originText;
        }

        // One flag per character: true when the character lies inside some hit.
        final boolean[] highlighted = new boolean[originText.length()];
        // The callback's end index is treated as exclusive (the original code used
        // end - 1 as the last covered index). A zero-length hit (begin == end)
        // therefore marks nothing instead of touching highlighted[begin].
        acdt.parseText(originText, (begin, end, value) -> {
            for (int i = begin; i < end; i++) {
                highlighted[i] = true;
            }
        });

        StringBuilder result = new StringBuilder(originText.length());
        boolean insideHighlight = false;
        for (int i = 0; i < highlighted.length; i++) {
            if (highlighted[i] != insideHighlight) {
                // Crossing a boundary: close the running highlight or open a new one.
                if (insideHighlight) {
                    result.append(GSMGlobalConfig.highlightSuffix);
                } else {
                    result.append(GSMGlobalConfig.highlightPrefix);
                }
                insideHighlight = !insideHighlight;
            }
            result.append(originText.charAt(i));
        }
        // A hit that reaches the end of the text still needs its closing marker.
        if (insideHighlight) {
            result.append(GSMGlobalConfig.highlightSuffix);
        }
        return result.toString();
    }
}