思路:使用 DFA(确定性有穷自动机)算法。
确定性有穷自动机常用于正则表达式的匹配;这里把敏感词按字符逐层嵌套地存入 Map,再从文本的每个位置出发逐字符匹配,采用最长左子式匹配。
/**
 * Scans a text for sensitive words using the shared {@code sensitiveWordMap} trie.
 *
 * <p>The text is walked left to right. At each position {@code testSensitiveWord} reports the
 * length of the sensitive word starting there (0 when there is none). A hit is recorded and the
 * scan resumes right behind it, so recorded hits never overlap.
 *
 * @param scriptText the text to inspect; {@code null} is treated like empty text
 * @param matchType  matching mode, forwarded unchanged to {@code testSensitiveWord}
 *                   (1 = minimal match, 2 = full match)
 * @return the distinct sensitive words found in {@code scriptText}; empty when nothing matched,
 *         never {@code null}
 */
public static Set<String> checkSensitiveWord(String scriptText, int matchType) {
    Set<String> sensitiveWordSet = new HashSet<>();
    if (scriptText == null) {
        // No text means no hits; previously this ended in a NullPointerException.
        return sensitiveWordSet;
    }
    int i = 0;
    while (i < scriptText.length()) {
        int length = testSensitiveWord(scriptText, i, matchType, sensitiveWordMap);
        if (length > 0) {
            sensitiveWordSet.add(scriptText.substring(i, i + length));
            // Continue with the first character after the hit.
            i += length;
        } else {
            i++;
        }
    }
    return sensitiveWordSet;
}
构建敏感词 Map(将敏感词按字符逐层嵌套地存入 Map):
/**
 * Builds the sensitive-word trie and stores it in {@code sensitiveWordMap}.
 *
 * <p>Every trie node below the root is a map that holds:
 * <ul>
 *   <li>{@code Character} keys: the child node for the next character of a word;</li>
 *   <li>{@code "isEnd"}: {@code "1"} when a word ends at this node, otherwise {@code "0"};</li>
 *   <li>{@code "deepCount"}: present only where a word ends, the length of that word as a
 *       decimal string.</li>
 * </ul>
 * The root map holds {@code Character} keys only.
 *
 * <p>The trie is assembled in a local map and assigned to {@code sensitiveWordMap} only after
 * it is complete, so the field never refers to a partially filled dictionary while a rebuild is
 * running. NOTE(review): whether the assignment is safely visible to other threads depends on
 * how the field is declared, which is not visible here.
 *
 * @param wordSenstives entries whose {@code getSenstiveWord()} values form the dictionary;
 *                      must not be {@code null}; entries whose word is {@code null} are skipped
 */
@SuppressWarnings({"rawtypes", "unchecked"}) // nodes mix Character and String keys by design
public static void initSensitiveWordMap(List<WordSenstive> wordSenstives) {
    log.info("开始初始化敏感词map");
    // De-duplicate the configured words.
    Set<String> keyWordSet = wordSenstives.stream()
            .map(WordSenstive::getSenstiveWord)
            .collect(Collectors.toCollection(HashSet::new));

    // Raw HashMap keeps the local assignment-compatible with the field, exactly like the
    // "new HashMap(...)" expression that used to be assigned to it directly.
    HashMap root = new HashMap(keyWordSet.size());
    for (String key : keyWordSet) {
        if (key == null) {
            continue;
        }
        Map nowMap = root;
        for (int i = 0; i < key.length(); i++) {
            char keyChar = key.charAt(i);
            Object wordMap = nowMap.get(keyChar);
            if (wordMap != null) {
                // This prefix already exists: descend into it.
                nowMap = (Map) wordMap;
            } else {
                // First word with this prefix: create the node, not (yet) a word end.
                Map newWordMap = new HashMap();
                newWordMap.put("isEnd", "0");
                nowMap.put(keyChar, newWordMap);
                nowMap = newWordMap;
            }
            if (i == key.length() - 1) {
                // Last character of the word: mark the node and remember the word length.
                nowMap.put("deepCount", String.valueOf(i + 1));
                nowMap.put("isEnd", "1");
            }
        }
    }

    // Publish the finished trie in one step.
    sensitiveWordMap = root;
    log.info("敏感词map构建完成");
}
从文本的指定位置开始匹配敏感词,返回匹配到的长度(0 表示未匹配):
/**
 * Tests whether a sensitive word starts at position {@code index} of {@code scriptText}.
 *
 * <p>The trie is walked from its root, one text character per level. Whenever a node marked
 * {@code "isEnd" = "1"} is reached, {@code isWord} is asked (with the node's
 * {@code "deepCount"}) whether the hit counts as a word.
 *
 * <ul>
 *   <li>{@code matchType == 1} (minimal match): the walk stops at the first word end,
 *       whether or not that hit was accepted.</li>
 *   <li>any other value (2 = full match): the walk continues as long as the text follows the
 *       trie, and the longest accepted word is reported. Previously the walk also stopped at
 *       the first accepted word in this mode, so a longer dictionary word sharing that prefix
 *       could never be returned.</li>
 * </ul>
 *
 * <p>Hits shorter than two characters are never reported.
 *
 * @param scriptText       the text being scanned; must not be {@code null}
 * @param index            position in {@code scriptText} at which the match must start
 * @param matchType        1 for minimal match, otherwise longest match
 * @param sensitiveWordMap root of the trie; must not be {@code null}
 * @return the length of the matched sensitive word, or 0 when none starts at {@code index}
 */
private static int testSensitiveWord(String scriptText, int index, int matchType, Map sensitiveWordMap) {
    int walked = 0;        // characters consumed along the trie so far
    int matchedLength = 0; // length of the last accepted word, 0 while none was accepted
    Map nowMap = sensitiveWordMap;
    for (int i = index; i < scriptText.length(); i++) {
        nowMap = (Map) nowMap.get(scriptText.charAt(i));
        if (nowMap == null) {
            // The text leaves the trie: nothing longer can match.
            break;
        }
        walked++;
        if ("1".equals(nowMap.get("isEnd"))) {
            int deepCount = Integer.parseInt((String) nowMap.get("deepCount"));
            if (isWord(scriptText, i, deepCount)) {
                // Remember this hit; trailing characters that merely follow the trie
                // without completing another word must not inflate the result.
                matchedLength = walked;
            }
            if (1 == matchType) {
                break;
            }
        }
    }
    return matchedLength < 2 ? 0 : matchedLength;
}
判断匹配到的内容是否为独立单词(匹配起点的前一个字符不是小写英文字母):
/**
 * Left-boundary check for a trie hit.
 *
 * <p>Looks at the character at position {@code i - deepCount}. The caller in this file passes
 * the index of the last matched character and the stored word length, which makes that
 * position the character immediately before the hit. The hit is rejected only when that
 * character exists and is a lowercase ASCII letter ({@code 'a'} to {@code 'z'}).
 *
 * @param scriptText the text being scanned
 * @param i          reference index inside {@code scriptText}
 * @param deepCount  offset subtracted from {@code i} to find the character to inspect
 * @return {@code false} when the inspected character is a lowercase ASCII letter;
 *         {@code true} otherwise, including when {@code i - deepCount} is negative
 */
private static boolean isWord(String scriptText, int i, int deepCount) {
    int precedingIndex = i - deepCount;
    if (precedingIndex < 0) {
        // Nothing in front of the hit.
        return true;
    }
    char preceding = scriptText.charAt(precedingIndex);
    return preceding < 'a' || preceding > 'z';
}