This article provides a set of Java code snippets for sensitive-word detection based on the DFA algorithm, as follows:
1. Build the multi-way tree data structure
import org.jetbrains.annotations.NotNull;

/**
 * Multi-way tree node
 * @author mark
 * @date 2023/2/28 9:50
 */
public class Word implements Comparable<Word> {
    /**
     * Character held by this node
     */
    public char c;
    /**
     * Whether this node is the last character of some sensitive word
     */
    public boolean isLast;
    /**
     * Child node list
     */
    public NextList next;

    public Word(char c) {
        this.c = c;
    }

    @Override
    public int compareTo(@NotNull Word word) {
        return c - word.c;
    }
}
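As a quick illustration (the sensitive words "ab" and "abc" below are made-up examples), a node can mark the end of one word and still carry children for a longer word; NextList, the child-list type, is defined in step 2:

Word a = new Word('a');      // shared first character of "ab" and "abc"
a.next = new NextList();     // child list (NextList is defined in step 2)
Word b = a.next.add('b');
b.isLast = true;             // "ab" ends here...
b.next = new NextList();
Word c = b.next.add('c');
c.isLast = true;             // ...and "abc" ends one level deeper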
2. A custom list that stores the tree's child nodes
import java.io.Serializable;
import java.util.ArrayList;

/**
 * Child node list
 * @author mark
 * @date 2023/2/28 9:50
 */
public class NextList extends ArrayList<Word> implements Serializable {

    private static final long serialVersionUID = 6813338342890963116L;

    /**
     * Sequential lookup by character
     */
    public Word get(char c) {
        for (Word word : this) {
            if (c == word.c) {
                return word;
            }
        }
        return null;
    }

    /**
     * Binary search (the list must already be sorted in ascending order)
     */
    public Word binaryGet(char c) {
        int left = 0;
        int right = this.size() - 1;
        int middle;
        Word word;
        while (left <= right) {
            middle = (left + right) / 2;
            word = this.get(middle);
            if (c == word.c) {
                return word;
            } else if (c > word.c) {
                left = middle + 1;
            } else {
                right = middle - 1;
            }
        }
        return null;
    }

    /**
     * Append a new node for the given character and return it
     */
    public Word add(char c) {
        Word word = new Word(c);
        super.add(word);
        return word;
    }
}
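A minimal usage sketch of NextList: characters may be appended in any order, but the list must be sorted before binaryGet is called, since binary search only works on an ordered list:

NextList list = new NextList();
list.add('c');
list.add('a');
list.add('b');
java.util.Collections.sort(list);   // ascending order is required by binaryGet
Word hit = list.binaryGet('b');     // found via binary search
Word miss = list.binaryGet('x');    // null: not present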
3. Implement the detector that checks text for sensitive words
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Sensitive word detector
 * @author mark
 * @date 2023/2/28 10:32
 */
@Slf4j
public class SensitiveWordDetector {

    /**
     * Classpath location of the sensitive word file (relative to the resources directory)
     */
    private static final String SENSITIVE_WORD_FILE = "sensitive/word/sensitive-word.txt";

    /**
     * Root node list of the sensitive word tree
     */
    private static NextList wordList;

    static {
        loadSensitiveWords();
    }
    /**
     * Detect whether the input text contains a sensitive word
     * @param text input text
     * @return true: contains a sensitive word; false: does not
     */
    public static boolean detect(String text) {
        if (StringUtils.isBlank(text) || CollectionUtils.isEmpty(wordList)) {
            return false;
        }
        char[] chars = text.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            // First-level node
            Word word = wordList.binaryGet(chars[i]);
            if (word == null) {
                continue;
            }
            // A single-character sensitive word matches immediately
            if (word.isLast) {
                return true;
            }
            // Next-level child list
            NextList childNodeList = word.next;
            if (childNodeList == null) {
                return true;
            }
            int j = i + 1;
            for (; j < chars.length; j++) {
                /*
                 * Sequential lookup is used here instead of binary search, because few sensitive
                 * words share the same leading character. For example, wordList holds the first
                 * character of every sensitive word and can easily contain thousands of entries,
                 * but once the words starting with, say, "T" have been locked in, there may be
                 * only about ten of them, and binary search would then be slower than a
                 * sequential scan.
                 */
                Word childWord = childNodeList.get(chars[j]);
                if (childWord == null) {
                    break;
                }
                if (childWord.isLast) {
                    return true;
                }
                childNodeList = childWord.next;
                if (childNodeList == null) {
                    return true;
                }
            }
        }
        return false;
    }
    /**
     * Load the sensitive words (entries are sorted in ascending order)
     */
    public static void loadSensitiveWords() {
        InputStream inputStream = SensitiveWordDetector.class.getClassLoader().getResourceAsStream(SENSITIVE_WORD_FILE);
        if (inputStream == null) {
            return;
        }
        // try-with-resources so the reader is closed even if reading fails
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            String line;
            List<String> textList = new ArrayList<>();
            while ((line = bufferedReader.readLine()) != null) {
                textList.add(line);
            }
            wordList = convert(textList);
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
    }
    private static NextList convert(List<String> textList) {
        if (CollectionUtils.isEmpty(textList)) {
            return null;
        }
        // First-level node list
        NextList wordList = new NextList();
        for (String text : textList) {
            if (StringUtils.isBlank(text)) {
                continue;
            }
            // Current node
            Word nowWord = null;
            // Child list of the current node
            NextList childNodeList = wordList;
            char[] chars = text.toCharArray();
            for (char aChar : chars) {
                if (childNodeList == null) {
                    nowWord.next = new NextList();
                    childNodeList = nowWord.next;
                }
                nowWord = childNodeList.get(aChar);
                if (nowWord == null) {
                    nowWord = childNodeList.add(aChar);
                }
                childNodeList = nowWord.next;
            }
            // Mark the last node as the end of a sensitive word
            if (nowWord != null) {
                nowWord.isLast = true;
            }
        }
        sort(wordList);
        return wordList;
    }
    /**
     * Sort every child list in ascending order
     */
    private static void sort(List<Word> wordList) {
        if (CollectionUtils.isEmpty(wordList)) {
            return;
        }
        // Sort the current level
        Collections.sort(wordList);
        // Recursively sort the child levels
        for (Word word : wordList) {
            sort(word.next);
        }
    }
}
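Finally, a hedged end-to-end sketch. It assumes sensitive-word.txt exists on the classpath at the configured location with one word per line (the words "badword" and "scam" below are made up); detection is then a single static call:

// sensitive-word.txt (one sensitive word per line), e.g.:
//   badword
//   scam
public class SensitiveWordDetectorDemo {
    public static void main(String[] args) {
        // false: no sensitive word present
        System.out.println(SensitiveWordDetector.detect("hello world"));
        // true: assuming "badword" is listed in sensitive-word.txt
        System.out.println(SensitiveWordDetector.detect("this text hides a badword inside"));
    }
}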