基于 DFA 算法的敏感词过滤实现

/**
 * Sensitive-word detection backed by a character trie (DFA).
 *
 * <p>The trie is a nested {@code Map<String, Object>}: each key is a single character
 * (as a one-char string) mapping to the child node, and a node that completes a word
 * additionally contains the {@link #END_FLAG} key. All whitespace is removed from both
 * the dictionary words and the text before matching, so {@code "低 俗"} matches the
 * word {@code "低俗"}.
 *
 * <p>Matching stops at the first completed word along a path, so when one dictionary
 * word is a prefix of another, the shorter word is the one reported.
 */
public class SensitiveWordsUtil {

    /**
     * Key marking "a word ends at this node". A single space is safe because whitespace
     * is stripped from every word and every text, so it can never be a real character key.
     */
    private final static String END_FLAG = " ";
    private final static String SPACE_REGEX = "\\s*";
    /** {@link #SPACE_REGEX} compiled once instead of on every call. */
    private final static java.util.regex.Pattern SPACE_PATTERN = java.util.regex.Pattern.compile(SPACE_REGEX);

    /**
     * Builds the trie used by the matching methods.
     *
     * @param sensitiveWords dictionary words; {@code null} entries and entries that are
     *                       empty after whitespace removal are skipped
     * @return the trie root, or an immutable empty map when the set is {@code null} or empty
     */
    public static Map<String, Object> initSensitiveWordsMap(Set<String> sensitiveWords) {
        if (sensitiveWords == null || sensitiveWords.isEmpty()) {
            return Collections.emptyMap();
        }
        Map<String, Object> root = new HashMap<>(sensitiveWords.size());
        for (String sensitiveWord : sensitiveWords) {
            if (sensitiveWord == null) {
                continue;
            }
            String word = stripWhitespace(sensitiveWord);
            if (word.isEmpty()) {
                continue;
            }
            Map<String, Object> node = root;
            for (int i = 0; i < word.length(); i++) {
                String key = String.valueOf(word.charAt(i));
                Map<String, Object> child = childOf(node, key);
                if (child == null) {
                    // Default capacity: sizing every node to the dictionary size wastes memory.
                    child = new HashMap<>();
                    node.put(key, child);
                }
                node = child;
            }
            // node is now the node of the word's last character.
            node.put(END_FLAG, null);
        }
        return root;
    }

    /**
     * Collects the sensitive words that occur in {@code text}.
     *
     * @param text              text to scan; whitespace is removed before scanning
     * @param matchType         {@link MatchType#MIN_MATCH} stops after the first hit;
     *                          any other value (including {@code null}) scans the whole text
     * @param sensitiveWordsMap trie produced by {@link #initSensitiveWordsMap(Set)}
     * @return the distinct words found; empty when the text is {@code null}/blank,
     *         the trie is {@code null}, or nothing matches
     */
    public static Set<String> getSensitiveWords(String text, MatchType matchType, Map<String, Object> sensitiveWordsMap) {
        if (text == null || sensitiveWordsMap == null) {
            return Collections.emptySet();
        }
        // Strip once here; the per-position matcher works on the already-stripped text.
        String stripped = stripWhitespace(text);
        if (stripped.isEmpty()) {
            return Collections.emptySet();
        }
        Set<String> found = new HashSet<>();
        int index = 0;
        while (index < stripped.length()) {
            int length = matchLength(stripped, index, sensitiveWordsMap);
            if (length == 0) {
                index++;
                continue;
            }
            found.add(stripped.substring(index, index + length));
            if (matchType == MatchType.MIN_MATCH) {
                break;
            }
            // Resume right after the matched word.
            index += length;
        }
        return found;
    }

    /**
     * Returns the length of the sensitive word starting at {@code startIndex}.
     *
     * <p>Whitespace is removed from {@code text} first, so {@code startIndex} refers to a
     * position in the whitespace-free text.
     *
     * @param text              text to inspect; must contain at least one non-whitespace character
     * @param startIndex        position in the whitespace-free text at which a word must begin
     * @param sensitiveWordsMap trie produced by {@link #initSensitiveWordsMap(Set)}
     * @return the number of characters of the complete word found at {@code startIndex},
     *         or {@code 0} when no complete word starts there (including when the text
     *         ends in the middle of a word, or the trie is {@code null})
     * @throws IllegalArgumentException if {@code text} is {@code null} or blank
     */
    public static int getSensitiveWordLength(String text, int startIndex, Map<String, Object> sensitiveWordsMap) {
        if (text == null || (text = stripWhitespace(text)).length() == 0) {
            throw new IllegalArgumentException("The input text must not be empty.");
        }
        if (sensitiveWordsMap == null) {
            return 0;
        }
        return matchLength(text, startIndex, sensitiveWordsMap);
    }

    /**
     * Tells whether {@code text} contains at least one sensitive word.
     *
     * @param text              text to scan
     * @param sensitiveWordsMap trie produced by {@link #initSensitiveWordsMap(Set)}
     * @return {@code true} if any sensitive word occurs in the text
     */
    public static boolean containsSensitiveWord(String text, Map<String, Object> sensitiveWordsMap) {
        return !getSensitiveWords(text, MatchType.MIN_MATCH, sensitiveWordsMap).isEmpty();
    }

    /** Removes every whitespace character matched by {@link #SPACE_REGEX}. */
    private static String stripWhitespace(String value) {
        return SPACE_PATTERN.matcher(value).replaceAll("");
    }

    /** Looks up the child node for {@code key}; the single place holding the unchecked cast. */
    @SuppressWarnings("unchecked")
    private static Map<String, Object> childOf(Map<String, Object> node, String key) {
        return (Map<String, Object>) node.get(key);
    }

    /**
     * Walks the trie over whitespace-free {@code text} from {@code startIndex} and returns
     * the length of the first complete word, or 0 if none is completed.
     */
    private static int matchLength(String text, int startIndex, Map<String, Object> trie) {
        Map<String, Object> node = trie;
        for (int i = startIndex; i < text.length(); i++) {
            node = childOf(node, String.valueOf(text.charAt(i)));
            if (node == null) {
                return 0;
            }
            if (node.containsKey(END_FLAG)) {
                return i - startIndex + 1;
            }
        }
        // Text ended while still inside a word: a prefix alone is not a match.
        return 0;
    }

    public enum MatchType {

        /**
         * Finding a single sensitive word is enough.
         */
        MIN_MATCH("最小匹配规则"),
        /**
         * All sensitive words contained in the text need to be found.
         */
        MAX_MATCH("最大匹配规则"),
        ;

        String desc;

        MatchType(String desc) {
            this.desc = desc;
        }
    }

    /** Small demo: prints whether the sample text contains any of the sample words. */
    public static void main(String[] args) {
        Set<String> sensitiveWords=new HashSet<>();
        sensitiveWords.add("秀");
        sensitiveWords.add("低俗");

        String text="你真秀,低 俗嘞";
        System.out.println(SensitiveWordsUtil.containsSensitiveWord(text, SensitiveWordsUtil.initSensitiveWordsMap(sensitiveWords)));
    }

}

参考文章:https://developer.aliyun.com/article/622759

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值