【全程干货】程序员必备算法!AC自动机算法敏感词匹配算法!动画演示讲解,看完轻松掌握,面试官都被你唬住!!_哔哩哔哩_bilibili
著名的多模匹配算法
引入依赖:
<dependency> <groupId>org.ahocorasick</groupId> <artifactId>ahocorasick</artifactId> <version>0.6.3</version> </dependency>
public List<String> findKeywordsInText(String text, List<String> keywords) {
Trie trie = Trie.builder()
.addKeywords(keywords)
.build();
long start = System.currentTimeMillis();
Collection<Emit> emits = trie.parseText(text);
logger.info("trie: " + (System.currentTimeMillis() - start));
List<String> matchedKeywords = new ArrayList<>();
for (Emit emit : emits) {
matchedKeywords.add(emit.getKeyword());
}
return matchedKeywords;
}
main demo
public static void main(String[] args) {
String BASE_URL = "https://www.baidu.com/s?wd=";
String text = "abcd";
List<String> wordList = new ArrayList<>();
wordList.add("ab");
wordList.add("bc");
wordList.add("cd");
wordList.add("a");
wordList.add("bcd");
wordList.add("abc");
// 构建 Trie 时忽略大小写
Trie trie = Trie.builder()
.ignoreCase()
.addKeywords(wordList)
.build();
Collection<Emit> emits = trie.parseText(text);
// 按照关键词长度降序排序
List<Emit> sortedEmits = new ArrayList<>(emits);
sortedEmits.sort((e1, e2) -> e2.getKeyword().length() - e1.getKeyword().length());
// 移除开始位置在前面词的开始和结束中间的关键词
Set<String> processedKeywords = new HashSet<>();
List<Emit> filteredEmits = new ArrayList<>();
for (Emit emit : sortedEmits) {
String keyword = emit.getKeyword();
boolean shouldAdd = true;
for (Emit filteredEmit : filteredEmits) {
if (emit.getStart() >= filteredEmit.getStart() && emit.getStart() <= filteredEmit.getEnd()) {
shouldAdd = false;
break;
}
}
if (shouldAdd) {
filteredEmits.add(emit);
processedKeywords.add(keyword);
}
}
// 使用TreeMap来保持emit的start位置顺序
TreeMap<Integer, Emit> firstOccurrences = new TreeMap<>();
for (Emit emit : filteredEmits) {
firstOccurrences.putIfAbsent(emit.getStart(), emit);
}
StringBuilder result = new StringBuilder(text);
int offset = 0;
processedKeywords.clear();
for (Emit emit : firstOccurrences.values()) {
String keyword = emit.getKeyword();
if (!processedKeywords.contains(keyword)) {
int startIndex = emit.getStart() + offset;
int endIndex = emit.getEnd() + 1 + offset;
String link = "<a href=\"" + BASE_URL + keyword + "\" class='cnt-inner-link'>" + keyword + "</a>";
result.replace(startIndex, endIndex, link);
offset += link.length() - keyword.length();
processedKeywords.add(keyword);
}
}
System.out.println(result.toString());
}