敏感词检测算法

思路:DFA算法

DFA(Deterministic Finite Automaton,确定性有穷自动机)常用于正则表达式的匹配,匹配时遵循"最左最长"原则。本文把敏感词构建成以字符为键的嵌套 Map,再从文本的每个位置出发逐字符向下匹配。

   /**
     * Scans a text and collects every sensitive word found in it.
     *
     * <p>The text is walked from left to right. At each position
     * {@code testSensitiveWord} is asked for the length of the match that starts
     * there; a positive length adds that substring to the result and moves the
     * scan past the matched span, so reported matches never overlap.
     *
     * @param scriptText text to scan; {@code null} is treated as containing no
     *                   sensitive words
     * @param matchType  matching mode, forwarded unchanged to
     *                   {@code testSensitiveWord} (annotated there as
     *                   1 = minimum match, 2 = full match)
     * @return a new mutable set holding the distinct matched substrings; empty
     *         when nothing matches
     */
    public static Set<String> checkSensitiveWord(String scriptText, int matchType) {
        Set<String> sensitiveWordSet = new HashSet<>();
        if (scriptText == null) {
            // Nothing to scan; avoid the NullPointerException on scriptText.length().
            return sensitiveWordSet;
        }
        int position = 0;
        while (position < scriptText.length()) {
            int length = testSensitiveWord(scriptText, position, matchType, sensitiveWordMap);
            if (length > 0) {
                sensitiveWordSet.add(scriptText.substring(position, position + length));
                // Resume right after the matched word.
                position += length;
            } else {
                position++;
            }
        }
        return sensitiveWordSet;
    }

构建敏感词map

 /**
  * Builds the nested-map lookup structure used for sensitive-word matching and
  * stores it in {@code sensitiveWordMap}.
  *
  * <p>Every word becomes a path of nodes, one per character. A node is a map
  * whose {@code Character} keys lead to child nodes and which also carries the
  * {@code String} entry {@code "isEnd"} ({@code "1"} when a word ends on that
  * node, {@code "0"} otherwise). Nodes on which a word ends additionally carry
  * {@code "deepCount"}, the length of that word as a decimal string.
  *
  * <p>The structure is assembled in a local map and assigned to
  * {@code sensitiveWordMap} only once it is complete. Code that reads the field
  * while the dictionary is being rebuilt therefore does not observe an empty or
  * half-filled map, and a failure part-way through leaves the previous map in
  * place. NOTE(review): visibility across threads also depends on how the field
  * is declared, which is not visible here - confirm.
  *
  * @param wordSenstives dictionary entries; their {@code getSenstiveWord()}
  *                      values are de-duplicated and {@code null} words are skipped
  */
 @SuppressWarnings({"rawtypes", "unchecked"})
 public static void initSensitiveWordMap(List<WordSenstive> wordSenstives) {
        log.info("开始初始化敏感词map");
        // HashSet tolerates null words; they are filtered out in the loop below.
        Set<String> keyWordSet = wordSenstives.stream()
                .map(WordSenstive::getSenstiveWord)
                .collect(Collectors.toCollection(HashSet::new));
        // Raw HashMap, like the original assignment, so it stays assignable to the
        // field whatever its declared map type is.
        HashMap root = new HashMap(keyWordSet.size());
        for (String key : keyWordSet) {
            if (key == null) {
                continue;
            }
            Map node = root;
            int lastIndex = key.length() - 1;
            for (int i = 0; i <= lastIndex; i++) {
                char keyChar = key.charAt(i);
                Map child = (Map) node.get(keyChar);
                if (child == null) {
                    child = new HashMap();
                    child.put("isEnd", "0");
                    node.put(keyChar, child);
                }
                node = child;
                if (i == lastIndex) {
                    // Mark the node reached by the last character as a word end.
                    node.put("deepCount", String.valueOf(i + 1));
                    node.put("isEnd", "1");
                }
            }
        }
        // Publish only the fully built structure.
        sensitiveWordMap = root;
        log.info("敏感词map构建完成");
    }

匹配敏感词

 /**
  * Returns the length of the sensitive word that starts at {@code index}, or 0
  * when no reportable word starts there.
  *
  * <p>The text is followed character by character through the nested map. A
  * node whose {@code "isEnd"} entry is {@code "1"} is a candidate word end; the
  * candidate is accepted only when at least two characters were consumed and
  * {@code isWord} approves it.
  *
  * <p>With {@code matchType == 1} the walk stops at the first candidate end,
  * accepted or not. With any other value the walk continues as far as the map
  * allows and the longest accepted candidate wins. Previously the loop stopped
  * at the first accepted candidate in every mode, so a longer dictionary entry
  * sharing a prefix with a shorter one was never returned.
  *
  * @param scriptText       text being scanned; {@code null} yields 0
  * @param index            position in {@code scriptText} where matching starts
  * @param matchType        1 = minimum match, otherwise full (longest) match
  * @param sensitiveWordMap root of the nested lookup map; {@code null} yields 0
  * @return number of characters of the accepted word, or 0 if there is none
  */
 private static int testSensitiveWord(String scriptText, int index, int matchType, Map sensitiveWordMap) {
        if (scriptText == null || sensitiveWordMap == null) {
            // No text, or the dictionary has not been initialised: nothing can match.
            return 0;
        }
        int acceptedLength = 0; // length of the best accepted word so far
        int depth = 0;          // characters consumed along the current path
        Map nowMap = sensitiveWordMap;
        for (int i = index; i < scriptText.length(); i++) {
            nowMap = (Map) nowMap.get(scriptText.charAt(i));
            if (nowMap == null) {
                break;
            }
            depth++;
            if (!"1".equals(nowMap.get("isEnd"))) {
                continue;
            }
            int deepCount = Integer.parseInt((String) nowMap.get("deepCount"));
            // Single-character hits are never reported, so they must not win.
            if (depth >= 2 && isWord(scriptText, i, deepCount)) {
                acceptedLength = depth;
            }
            if (1 == matchType) {
                // Minimum match: the first word end decides the outcome.
                break;
            }
        }
        // Characters consumed after the last accepted end are not part of the match.
        return acceptedLength;
    }

判断匹配结果是否为独立单词(匹配起点的前一个字符不能是小写英文字母)

   /**
    * Tells whether a match is free of a lowercase ASCII letter immediately
    * before it.
    *
    * <p>The inspected position is {@code i - deepCount}. When that position lies
    * before the start of the text there is nothing to inspect and the match is
    * accepted.
    *
    * @param scriptText text being scanned
    * @param i          index of the last matched character
    * @param deepCount  number of characters in the match
    * @return {@code false} when the character at {@code i - deepCount} is in
    *         {@code 'a'..'z'}; {@code true} otherwise
    */
   private static boolean isWord(String scriptText, int i, int deepCount) {
        int precedingIndex = i - deepCount;
        if (precedingIndex < 0) {
            return true;
        }
        char preceding = scriptText.charAt(precedingIndex);
        return preceding < 'a' || preceding > 'z';
    }

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

爱打球的白师傅

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值