java用DFA实现脏词过滤以及用FileAlterationListenerAdaptor实现对资源文件修改的动态监听

最新推荐文章于 2024-07-17 21:53:33 发布

生活不只*眼前的苟且

最新推荐文章于 2024-07-17 21:53:33 发布

阅读量1.6k

点赞数 1

分类专栏： Java

本文链接：https://blog.csdn.net/u011734144/article/details/51532220

版权

Java 专栏收录该内容

78 篇文章 2 订阅

订阅专栏

最近在跟朋友做一个博客网站，里面涉及一些脏词过滤算法。查了很多资料，总结来说DFA算法是比较理想的，效率高。脏词本身是一个可以配置的东西，所以不能在程序中将脏词写死，否则要修改或者添加一些新的脏词，还需要修改代码并重新启动服务器，这显然是不可行的。所以脏词需要做成一个配置文件，并且当配置文件修改后，不需要重启服务器就可以自动监测到修改后的脏词，这个是用FileAlterationListenerAdaptor实现的。废话不多说，直接上代码。我的代码主要分成了两个类，一个是监听器类，一个是脏词检查类。

1. 监听器类，主要实现导入配置文件中的脏词，以及实现对脏词文件的监听，实现脏词文件修改后自动导入

/*
 * Copyright 2002-2016 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.zuoxiaolong.blog.common.utils;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.monitor.FileAlterationListenerAdaptor;
import org.apache.commons.io.monitor.FileAlterationMonitor;
import org.apache.commons.io.monitor.FileAlterationObserver;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * 脏词配置文件监听器
 *
 * @author linjiedeng
 * @date 16/5/28 下午4:26
 * @since 1.0.0
 */
public class SensitiveWordMonitor extends FileAlterationListenerAdaptor {

    private static SensitiveWordMonitor sensitiveWordMonitor;

    public static Map sensitiveWordMap = new HashMap<>();

    private static String SENSITIVE_WORD_FILE_NAME = "bad-word.properties";

    //服务启动的时候载入脏词文件,并实现对文件的实时监听, 每10分钟监听一次
    static {
//获取资源文件的路径
        String classPathHole = SensitiveWordMonitor.class.getResource("/").toString();
//去掉开头的file：
        String classPath = classPathHole.substring(5);
/监听器类加载的时候首先要导入一次脏词文件
        File file = new File(classPath + SENSITIVE_WORD_FILE_NAME);
        Set<String> badWordSet = badWordSet = loadBadWord(file);
        sensitiveWordMap = addSensitiveWordToHashMap(badWordSet);
        SensitiveWordMonitor.getSensitiveWordMonitor().monitor(classPath, 10 * 60 * 1000);
    }

    private SensitiveWordMonitor() {

    }

    public static SensitiveWordMonitor getSensitiveWordMonitor() {
        if(sensitiveWordMonitor == null) {
            synchronized (SensitiveWordMonitor.class) {
                if(sensitiveWordMonitor == null) {
                    sensitiveWordMonitor = new SensitiveWordMonitor();
                }
            }
        }

        return sensitiveWordMonitor;
    }

    private static Set<String> loadBadWord(File file) {
        Set<String> badWordSet = new HashSet<>();
        try {
            LineIterator it = FileUtils.lineIterator(file);
            while(it.hasNext()) {
                String badWord = it.nextLine();
                badWordSet.add(badWord);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        return badWordSet;
    }

    @Override
    public void onFileChange(File file) {
        Set<String> badWordSet = badWordSet = loadBadWord(file);
        sensitiveWordMap = addSensitiveWordToHashMap(badWordSet);
    }

    private static Map addSensitiveWordToHashMap(Set<String> badWordSet) {

        Map wordMap = new HashMap(badWordSet.size());

        for (String word : badWordSet) {
            Map currentMap = wordMap;
            for (int i = 0; i < word.length(); i++) {

                char keyChar = word.charAt(i);
                Object tempMap = currentMap.get(keyChar);

                if (tempMap != null) {
                    currentMap = (Map) tempMap;
                } else {
                    Map<String, String> newMap = new HashMap<String, String>();
                    newMap.put("isEnd", "0");

                    currentMap.put(keyChar, newMap);
                    currentMap = newMap;
                }

                if (i == word.length() - 1) {
                    currentMap.put("isEnd", "1");
                }
            }
        }

        return wordMap;
    }

    public void monitor(String directory, int interval) {
        FileAlterationObserver fileAlterationObserver = new FileAlterationObserver(directory, FileFilterUtils.and(FileFilterUtils.nameFileFilter(SENSITIVE_WORD_FILE_NAME)), null);
        fileAlterationObserver.addListener(this);
        FileAlterationMonitor fileAlterationMonitor = new FileAlterationMonitor(interval, fileAlterationObserver);

        try {
            fileAlterationMonitor.start();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

2. SensitiveWordCheckUtils脏词检查工具类，实现对脏词的检查判断，当然也可以实现替换，这里没有写，但是弄清楚了原理的话实现起来很简单

/*
 * Copyright 2002-2016 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.zuoxiaolong.blog.common.utils;

import java.util.Map;

/**
 * Checks text against the dictionary held in
 * {@code SensitiveWordMonitor.sensitiveWordMap}.
 *
 * <p>The dictionary is walked as a trie of nested maps: looking up a
 * {@link Character} yields the child node for that character, and a node whose
 * {@code "isEnd"} entry equals {@code "1"} marks the end of a word.
 *
 * @author linjiedeng
 * @date 16/5/28 7:02 PM
 * @since 1.0.0
 */
public class SensitiveWordCheckUtils {

    public static int MIN_MATCH_TYPE = 1;   // minimum-match rule; the rule this class applies

    public static int MAX_MATCH_TYPE = 2;   // maximum-match rule; not used by this class

    /** A match shorter than this many characters is not reported as a sensitive word. */
    private static final int MIN_WORD_LENGTH = 2;

    /**
     * Tells whether the sentence contains a sensitive word.
     *
     * @param sentence the text to check; may be {@code null}
     * @return {@code true} if a sensitive word of at least two characters
     *         starts at some position of {@code sentence}; {@code false}
     *         otherwise, including for {@code null} or empty input
     */
    public static boolean isContainSensitiveWord(String sentence) {
        if (sentence == null || sentence.isEmpty()) {
            return false;
        }
        // Read the dictionary once so the whole scan uses the same version
        // even if the field is replaced while scanning.
        Map<?, ?> dictionary = SensitiveWordMonitor.sensitiveWordMap;
        if (dictionary == null || dictionary.isEmpty()) {
            return false;
        }
        for (int i = 0; i < sentence.length(); i++) {
            if (hasSensitiveWord(sentence, i, dictionary)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a sensitive word starts exactly at {@code startCheckIndex}.
     *
     * @param sentence        the text to check
     * @param startCheckIndex index of the first character to match
     * @param dictionary      root node of the trie
     * @return {@code true} if a word of at least {@code MIN_WORD_LENGTH}
     *         characters starts at {@code startCheckIndex}
     */
    private static boolean hasSensitiveWord(String sentence, int startCheckIndex, Map<?, ?> dictionary) {
        Map<?, ?> node = dictionary;
        int matchCount = 0;
        for (int i = startCheckIndex; i < sentence.length(); i++) {
            node = (Map<?, ?>) node.get(sentence.charAt(i));
            if (node == null) {
                // No word in the dictionary continues with this character.
                return false;
            }
            matchCount++;
            // Minimum-match rule: report the first word end that is long enough.
            // A word end that is too short must not stop the scan, otherwise a
            // one-character entry would hide every longer word sharing its prefix.
            if (matchCount >= MIN_WORD_LENGTH && "1".equals(node.get("isEnd"))) {
                return true;
            }
        }
        return false;
    }
}

测试类我就不写了，亲测肯定是没问题的。

当然，这些代码也从如下几篇文章中获取了很多有价值的信息，感谢几位：
http://blog.csdn.net/daixinmei/article/details/42082205
http://www.tuicool.com/articles/QneYFv

生活不只*眼前的苟且

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录