/**
 * Sensitive-word detection backed by a character trie.
 *
 * <p>The trie is a nested {@code Map<String, Object>}: each key is a single
 * character (as a one-char string) mapping to the child node, and a node that
 * terminates a word additionally contains the {@code END_FLAG} key. Whitespace
 * matched by the regex class {@code \s} is removed from both the configured
 * words and the scanned text before matching, so {@code "b a d"} matches the
 * word {@code "bad"}.
 */
public class SensitiveWordsUtil {

    /**
     * Trie key marking "a word ends at this node". It is a whitespace character,
     * and whitespace is stripped from every word and text, so it cannot collide
     * with a real character key.
     */
    private static final String END_FLAG = " ";

    /** Regex for the whitespace runs removed from words and scanned text. */
    private static final String SPACE_REGEX = "\\s+";

    /**
     * Builds the trie used by the lookup methods of this class.
     *
     * @param sensitiveWords the words to register; {@code null} entries and
     *                       entries that are empty after whitespace removal are skipped
     * @return the trie root, or an empty immutable map when {@code sensitiveWords}
     *         is {@code null} or empty
     */
    public static Map<String, Object> initSensitiveWordsMap(Set<String> sensitiveWords) {
        if (sensitiveWords == null || sensitiveWords.isEmpty()) {
            return Collections.emptyMap();
        }
        Map<String, Object> root = new HashMap<>(sensitiveWords.size());
        for (String sensitiveWord : sensitiveWords) {
            if (sensitiveWord == null) {
                continue;
            }
            String word = stripWhitespace(sensitiveWord);
            if (word.isEmpty()) {
                continue;
            }
            Map<String, Object> node = root;
            for (int i = 0; i < word.length(); i++) {
                String key = String.valueOf(word.charAt(i));
                @SuppressWarnings("unchecked") // every non-END_FLAG value is a node map created below
                Map<String, Object> child = (Map<String, Object>) node.get(key);
                if (child == null) {
                    // Default capacity: a node rarely has more than a handful of children,
                    // so sizing each one for the whole vocabulary would only waste memory.
                    child = new HashMap<>();
                    node.put(key, child);
                }
                node = child;
            }
            // Only the presence of the key matters; lookups use containsKey.
            node.put(END_FLAG, null);
        }
        return root;
    }

    /**
     * Collects the sensitive words occurring in {@code text}.
     *
     * <p>Scanning proceeds left to right. At each position the shortest registered
     * word starting there is taken, and scanning resumes right after it.
     *
     * @param text              the text to scan; whitespace is removed before matching
     * @param matchType         {@link MatchType#MIN_MATCH} stops after the first hit;
     *                          any other value collects every hit
     * @param sensitiveWordsMap trie produced by {@link #initSensitiveWordsMap(Set)};
     *                          {@code null} or empty means no word can match
     * @return the matched words (taken from the whitespace-stripped text); empty when
     *         nothing matches or {@code text} is {@code null} or blank
     */
    public static Set<String> getSensitiveWords(String text, MatchType matchType, Map<String, Object> sensitiveWordsMap) {
        if (text == null) {
            return Collections.emptySet();
        }
        // Strip once here instead of once per start index inside the loop.
        String stripped = stripWhitespace(text);
        if (stripped.isEmpty()) {
            return Collections.emptySet();
        }
        Set<String> found = new HashSet<>();
        if (sensitiveWordsMap == null || sensitiveWordsMap.isEmpty()) {
            return found;
        }
        int index = 0;
        while (index < stripped.length()) {
            int length = matchLength(stripped, index, sensitiveWordsMap);
            if (length == 0) {
                index++;
                continue;
            }
            found.add(stripped.substring(index, index + length));
            if (matchType == MatchType.MIN_MATCH) {
                break;
            }
            // Resume right after the matched word so matches never overlap.
            index += length;
        }
        return found;
    }

    /**
     * Returns the length of the shortest registered word that starts at
     * {@code startIndex} of the whitespace-stripped {@code text}.
     *
     * <p>Note that {@code startIndex} refers to the text <em>after</em> whitespace
     * removal. A prefix of a registered word that is cut off by the end of the text
     * is not a match and yields {@code 0}.
     *
     * @param text              the text to inspect; whitespace is removed before matching
     * @param startIndex        index into the stripped text at which a word must start
     * @param sensitiveWordsMap trie produced by {@link #initSensitiveWordsMap(Set)};
     *                          {@code null} means no word can match
     * @return the length of the matched word, or {@code 0} if no complete word starts there
     * @throws IllegalArgumentException        if {@code text} is {@code null} or blank
     * @throws StringIndexOutOfBoundsException if {@code startIndex} is negative
     */
    public static int getSensitiveWordLength(String text, int startIndex, Map<String, Object> sensitiveWordsMap) {
        String stripped = (text == null) ? "" : stripWhitespace(text);
        if (stripped.isEmpty()) {
            throw new IllegalArgumentException("The input text must not be empty.");
        }
        return matchLength(stripped, startIndex, sensitiveWordsMap);
    }

    /**
     * Tells whether {@code text} contains at least one registered sensitive word.
     *
     * @param text              the text to scan; {@code null} or blank never matches
     * @param sensitiveWordsMap trie produced by {@link #initSensitiveWordsMap(Set)}
     * @return {@code true} if any registered word occurs in the text
     */
    public static boolean containsSensitiveWord(String text, Map<String, Object> sensitiveWordsMap) {
        return !getSensitiveWords(text, MatchType.MIN_MATCH, sensitiveWordsMap).isEmpty();
    }

    /** Removes every whitespace run (regex {@code \s+}) from {@code value}. */
    private static String stripWhitespace(String value) {
        return value.replaceAll(SPACE_REGEX, "");
    }

    /**
     * Walks the trie along {@code text} (already whitespace-free) from
     * {@code startIndex} and returns the length of the first complete word, or
     * {@code 0} when the path breaks or the text ends before a word is completed.
     */
    private static int matchLength(String text, int startIndex, Map<String, Object> trie) {
        if (trie == null) {
            return 0;
        }
        Map<String, Object> node = trie;
        for (int i = startIndex; i < text.length(); i++) {
            @SuppressWarnings("unchecked") // trie nodes are maps built by initSensitiveWordsMap
            Map<String, Object> next = (Map<String, Object>) node.get(String.valueOf(text.charAt(i)));
            if (next == null) {
                return 0;
            }
            if (next.containsKey(END_FLAG)) {
                return i - startIndex + 1;
            }
            node = next;
        }
        // Text ended in the middle of a trie path: only a prefix matched, not a word.
        return 0;
    }

    /** Controls how many hits {@link #getSensitiveWords} collects. */
    public enum MatchType {
        /**
         * Finding a single sensitive word is enough.
         */
        MIN_MATCH("最小匹配规则"),
        /**
         * Find out every sensitive word the scanned text contains.
         */
        MAX_MATCH("最大匹配规则"),
        ;

        /** Human-readable description of the rule. */
        String desc;

        MatchType(String desc) {
            this.desc = desc;
        }
    }

    /** Small manual demo: prints whether the sample text contains a registered word. */
    public static void main(String[] args) {
        Set<String> sensitiveWords = new HashSet<>();
        sensitiveWords.add("秀");
        sensitiveWords.add("低俗");
        String text = "你真秀,低 俗嘞";
        System.out.println(SensitiveWordsUtil.containsSensitiveWord(text, SensitiveWordsUtil.initSensitiveWordsMap(sensitiveWords)));
    }
}
// Reference article: https://developer.aliyun.com/article/622759