JAVA两种实现文本敏感词检测的方式

1.基于DFA算法的实现

1.1 DFA介绍

DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。

1.2构建模型

举个例子来说,在bash脚本命令检测中,我们规定"rm"、"reboot"、"shutdown"、"::"、"/dev/null"、"rmr"为敏感词,则我们需要根据这6个敏感词来构建检测模型,使用json格式来表示:

{"r":{"e":{"b":{"isEnd":"0","o":{"isEnd":"0","o":{"t":{"deepCount":"6","isEnd":"1"},"isEnd":"0"}}},"isEnd":"0"},"isEnd":"0","m":{"r":{"deepCount":"3","isEnd":"1"},"deepCount":"2","isEnd":"1"}},":":{":":{"deepCount":"2","isEnd":"1"},"isEnd":"0"},"s":{"h":{"u":{"t":{"d":{"isEnd":"0","o":{"w":{"isEnd":"0","n":{"deepCount":"8","isEnd":"1"}},"isEnd":"0"}},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"},"/":{"d":{"e":{"v":{"isEnd":"0","/":{"isEnd":"0","n":{"u":{"l":{"l":{"deepCount":"9","isEnd":"1"},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"}}},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"}}

代码实现:

/**
 * Builds the DFA (trie) used for sensitive-word detection.
 *
 * <p>Each node is a map whose {@code Character} keys lead to child nodes and whose
 * {@code String} keys carry node metadata: {@code "isEnd"} is {@code "1"} when a
 * sensitive word ends at that node and {@code "0"} otherwise; {@code "deepCount"}
 * (present only on end nodes) is the length of the word ending there.
 *
 * <p>The finished trie is also printed to standard output as JSON.
 *
 * @return the root node of the trie
 */
@SuppressWarnings("unchecked")
private static Map initSensitiveWordMap(){
        Set<String> sensitiveWordSet = new HashSet<>();
        sensitiveWordSet.add("rm");
        sensitiveWordSet.add("reboot");
        sensitiveWordSet.add("shutdown");
        sensitiveWordSet.add("::");
        sensitiveWordSet.add("/dev/null");
        sensitiveWordSet.add("rmr");

        // Nodes mix Character keys (children) with String keys (metadata), so the
        // honest element type is Object rather than the original Map<String,String>.
        Map<Object, Object> sensitiveWordMap = new HashMap<>(sensitiveWordSet.size());
        for (String sensitiveWord : sensitiveWordSet) {
            // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
            String key = sensitiveWord.toLowerCase(java.util.Locale.ROOT);
            Map<Object, Object> nowMap = sensitiveWordMap;
            for (int i = 0; i < key.length(); i++) {
                char keyChar = key.charAt(i);
                Map<Object, Object> childMap = (Map<Object, Object>) nowMap.get(keyChar);
                if (childMap == null) {
                    // First time this prefix is seen: create a non-terminal node.
                    childMap = new HashMap<>();
                    childMap.put("isEnd", "0");
                    nowMap.put(keyChar, childMap);
                }
                nowMap = childMap;
            }
            if (!key.isEmpty()) {
                // nowMap is now the node of the word's last character: mark it terminal.
                nowMap.put("deepCount", String.valueOf(key.length()));
                nowMap.put("isEnd", "1");
            }
        }
        System.out.println(JSONObject.toJSONString(sensitiveWordMap));
        return sensitiveWordMap;
    }

1.3检测脚本内容

/**
 * Scans a script for sensitive words using the DFA built by
 * {@code initSensitiveWordMap()}.
 *
 * @param scriptText the script content to scan
 * @param matchType  matching mode forwarded to {@code testSensitiveWord}
 *                   (1: minimal match, 2: full match)
 * @return the distinct substrings of {@code scriptText} that matched
 */
public static Set<String> checkSensitiveWord(String scriptText, int matchType){
        Map trie = initSensitiveWordMap();
        Set<String> hits = new HashSet<>();

        int pos = 0;
        while (pos < scriptText.length()) {
            int matchedLength = testSensitiveWord(scriptText, pos, matchType, trie);
            if (matchedLength > 0) {
                hits.add(scriptText.substring(pos, pos + matchedLength));
                // Resume scanning right after the matched word.
                pos += matchedLength;
            } else {
                pos++;
            }
        }
        return hits;
    }

/**
 * Walks the trie from {@code index} and reports how many characters of
 * {@code scriptText} form a sensitive word starting there.
 *
 * @param scriptText       the text being scanned
 * @param index            position in {@code scriptText} where matching starts
 * @param matchType        1 stops at the first terminal node reached; any other value
 *                         keeps walking until a terminal node passes the {@code isWord} check
 * @param sensitiveWordMap root node of the trie
 * @return the number of characters matched, or 0 when fewer than two characters
 *         matched or the last terminal node did not pass the {@code isWord} check
 */
private static int testSensitiveWord(String scriptText,int index,int matchType,Map sensitiveWordMap){
        int matchedChars = 0;
        boolean boundaryOk = false;
        Map node = sensitiveWordMap;

        int pos = index;
        while (pos < scriptText.length()) {
            node = (Map) node.get(scriptText.charAt(pos));
            if (node == null) {
                // No transition for this character: the walk ends here.
                break;
            }
            matchedChars++;
            if ("1".equals(node.get("isEnd"))) {
                int wordLength = Integer.parseInt((String) node.get("deepCount"));
                boundaryOk = isWord(scriptText, pos, wordLength);
                if (boundaryOk || matchType == 1) {
                    break;
                }
            }
            pos++;
        }

        return (matchedChars >= 2 && boundaryOk) ? matchedChars : 0;
    }

/**
 * Checks whether the candidate match ending at index {@code i} stands on its own
 * as a word inside {@code scriptText}.
 *
 * <p>The match is rejected when the character right after it exists and is not
 * whitespace, or when the character right before it is a lowercase ASCII letter.
 * Any whitespace (space, tab, line break, ...) counts as a trailing boundary, so a
 * command terminated by a newline is recognised just like one followed by a space.
 *
 * @param scriptText the text being scanned
 * @param i          index of the last character of the candidate match
 * @param deepCount  length of the candidate match
 * @return {@code true} if the candidate is delimited as described above
 */
private static boolean isWord(String scriptText, int i,int deepCount) {
        int next = i + 1;
        if (next < scriptText.length() && !Character.isWhitespace(scriptText.charAt(next))) {
            return false;
        }
        // i - deepCount is the index of the character immediately before the match.
        int before = i - deepCount;
        if (before >= 0) {
            char previous = scriptText.charAt(before);
            if (previous >= 'a' && previous <= 'z') {
                return false;
            }
        }
        return true;
    }

2.基于动态正则法实现

/**
 * Regex-based sensitive-word check.
 *
 * <p>Builds a character class from {@code [a-z0-9]} plus every symbol that occurs in
 * the sensitive words, splits {@code scriptText} into maximal runs of those
 * characters, and reports each run that is exactly one of the sensitive words.
 *
 * @param scriptText the script content to scan; {@code null} yields an empty result
 * @return the distinct sensitive words found in {@code scriptText}
 */
private static Set<String> checkSensitiveWord(String scriptText){

        Set<String> base_set = new HashSet<>();
        base_set.add("rm");
        base_set.add("dd");
        base_set.add("reboot");

        Set<String> match_set = new HashSet<>();
        if (scriptText == null) {
            return match_set;
        }

        // Distinct characters used by the sensitive words.
        Set<Character> characters = new HashSet<>();
        for (String s : base_set) {
            for (int i = 0; i < s.length(); i++) {
                characters.add(s.charAt(i));
            }
        }

        StringBuilder all = new StringBuilder();
        for (Character character : characters) {
            all.append(character);
        }

        // Extract the symbols (anything that is not a-z, 0-9 or whitespace).
        String unexpected_character_reg = "[^a-z0-9\\s]";
        StringBuilder unexpected_character = new StringBuilder();
        Matcher m = Pattern.compile(unexpected_character_reg).matcher(all);
        while(m.find()) {
            String symbol = m.group();
            // Escape punctuation so characters such as ']', '\\', '^' or '-' stay
            // literal inside the character class; letters and digits must not be
            // escaped because a backslash would turn them into regex constructs.
            if (!Character.isLetterOrDigit(symbol.codePointAt(0))) {
                unexpected_character.append('\\');
            }
            unexpected_character.append(symbol);
        }

        // Assemble the final regex used to tokenise the script.
        String regexPre = "[a-z0-9";
        String regexSuf = "]+";
        String main_regex = regexPre + unexpected_character + regexSuf;
        Matcher main_m = Pattern.compile(main_regex).matcher(scriptText);
        while (main_m.find()) {
            String token = main_m.group();
            if (base_set.contains(token)){
                match_set.add(token);
            }
        }
        return match_set;

}

  • 0
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值