/**
* @Package_Name: com.sx.zkrcw.mgcgl.constants
* @Author:
* @Date: 2022/12/26
* @Time: 14:57
* @Description: TODO(敏感词缓存列表)
**/
import org.elasticsearch.common.Strings;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Sensitive-word detector backed by a character trie (a deterministic finite automaton).
 *
 * <p>Call {@link #initDFATrie(String[])} to load the word list, then use
 * {@link #checkText(String)}, {@link #checkTextString(String)} or
 * {@link #filterSensitiveWords(String)} to scan text. Until a word list has been loaded,
 * the scanning methods report no matches.
 *
 * <p>Matching is "shortest match": walking the trie stops at the first node that completes a
 * word, so if both {@code "ab"} and {@code "abc"} are loaded, the text {@code "abc"} is
 * reported as {@code "ab"}.
 */
public class SensitiveWords {

    /** Text substituted for each detected sensitive word by {@link #filterSensitiveWords}. */
    private static final String MASK = "***";

    /**
     * One state of the automaton. {@code word} is the complete sensitive word that ends at this
     * node, or {@code null} when no word ends here. Using {@code null} instead of a marker
     * string means no real word can collide with the "not terminal" marker.
     */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<Character, TrieNode>();
        private String word;
    }

    /** Root of the currently loaded trie; {@code null} until {@link #initDFATrie} is called. */
    private static TrieNode root = null;

    /**
     * Builds the trie for the given words and installs it as the active automaton.
     *
     * <p>The trie is assembled in a local variable and assigned to the static field only once
     * it is complete. {@code null} and empty entries are skipped; a {@code null} array
     * installs an empty trie.
     *
     * @param sensitiveWords the sensitive words to load
     */
    private static void DFAConstructor(String[] sensitiveWords) {
        TrieNode newRoot = new TrieNode();
        if (sensitiveWords != null) {
            for (String word : sensitiveWords) {
                if (word == null || word.isEmpty()) {
                    continue;
                }
                // Start every word from the root and descend one character at a time.
                TrieNode node = newRoot;
                for (int i = 0; i < word.length(); i++) {
                    Character key = Character.valueOf(word.charAt(i));
                    TrieNode next = node.children.get(key);
                    if (next == null) {
                        next = new TrieNode();
                        node.children.put(key, next);
                    }
                    node = next;
                }
                // The node reached by the last character marks the end of the word.
                node.word = word;
            }
        }
        root = newRoot;
    }

    /**
     * Scans the text and counts the sensitive words found.
     *
     * <p>After a match at position {@code i}, scanning resumes after the matched word, so
     * matches reported by this method never overlap.
     *
     * @param text the text to check; {@code null} is treated as empty
     * @return a map from each detected sensitive word to its number of occurrences; empty when
     *         nothing matched or no word list has been loaded
     */
    public static HashMap<String, Integer> checkText(String text) {
        HashMap<String, Integer> words = new HashMap<String, Integer>();
        TrieNode trie = root; // read once so the whole scan uses a single trie
        if (text == null || trie == null) {
            return words;
        }
        for (int i = 0; i < text.length(); i++) {
            String word = checkOne(trie, text, i);
            if (!word.isEmpty()) {
                // Skip the rest of the matched segment; the loop's i++ moves past its last char.
                i += word.length() - 1;
                Integer count = words.get(word);
                words.put(word, count == null ? 1 : count + 1);
            }
        }
        return words;
    }

    /**
     * Scans the text and returns the detected sensitive words as a comma-separated string.
     *
     * <p>Unlike {@link #checkText(String)}, this method tries a match at every position
     * without skipping matched segments, so repeated and overlapping matches each appear in
     * the result, in order of their start position.
     *
     * @param text the text to check; {@code null} is treated as empty
     * @return the matched words joined by {@code ","}, or {@code ""} when nothing matched
     */
    public static String checkTextString(String text) {
        TrieNode trie = root;
        if (text == null || trie == null) {
            return "";
        }
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            String word = checkOne(trie, text, i);
            if (!word.isEmpty()) {
                if (joined.length() > 0) {
                    joined.append(',');
                }
                joined.append(word);
            }
        }
        return joined.toString();
    }

    /**
     * Tries to match a sensitive word that starts exactly at {@code index}.
     *
     * @param trie  root of the automaton to walk
     * @param text  the text being scanned
     * @param index position in {@code text} at which the match must start
     * @return the first (shortest) sensitive word starting at {@code index}, or {@code ""} if
     *         the text ends or leaves the trie before a word is completed
     */
    private static String checkOne(TrieNode trie, String text, int index) {
        TrieNode node = trie;
        for (int i = index; i < text.length(); i++) {
            node = node.children.get(Character.valueOf(text.charAt(i)));
            if (node == null) {
                return ""; // no transition for this character: match failed
            }
            if (node.word != null) {
                return node.word; // reached a terminal state
            }
        }
        return ""; // text ended while the automaton was still mid-word
    }

    /**
     * Loads the sensitive-word list, replacing any previously loaded list.
     *
     * @param sensitiveWords the sensitive words to load
     */
    public static void initDFATrie(String[] sensitiveWords) {
        DFAConstructor(sensitiveWords);
    }

    /**
     * Masks sensitive words in the given string.
     *
     * <p>Every word detected by {@link #checkText(String)} is replaced, at all of its
     * occurrences, with {@code "***"}. Words are replaced as literal text, not as regular
     * expressions, so words containing characters such as {@code .} or {@code *} are handled
     * correctly.
     *
     * @param str the text to filter
     * @return the filtered text, or {@code null} if {@code str} is {@code null}
     */
    public static String filterSensitiveWords(String str) {
        if (str == null) {
            return null;
        }
        Map<String, Integer> found = checkText(str);
        for (String word : found.keySet()) {
            str = str.replace(word, MASK);
        }
        return str;
    }
}
// DFA (finite automaton) trie implementation of a sensitive-word cache list
// First published 2022-12-27 16:59:30