敏感词过滤

最近在做敏感词管理的工作,在网上查阅了很多资料,最终采用了构建前缀树(Trie)的方法,过滤速度还算满意,在此记录一下。

/**
 * 敏感词处理
 * @author yanyimin
 * @date 2017年11月17日 上午10:24:27
 * @version 1.0.0
 */
@Service
public class SensitiveWordsBiz implements ApplicationListener<ContextRefreshedEvent>{

    @Autowired
    SensitiveWordMapper sensitiveWordMapper;

    private Logger logger = LoggerFactory.getLogger(SensitiveWordsBiz.class);

    /**
     * 构造trie树 - 构建敏感词库
     * @param set
     */
   @SuppressWarnings("unchecked")
   public Map<String,Object> buildSensitiveWordMap(Set<String> set,Map<String,Object> sensitiveWordMap){
        Map<String,Object> childMap = new HashMap<>(); // 子Map
        Map<String,Object> parentMap = new HashMap<>(); // 父map

        Iterator<String> iterator = set.iterator();
        while (iterator.hasNext()){
            char[] word = iterator.next().toCharArray();
            parentMap = sensitiveWordMap;
            for (int i=0;i<word.length;i++){
                childMap = (Map<String, Object>) parentMap.get(String.valueOf(word[i]));
                if(childMap==null){
                    childMap = new HashMap<>();
                    parentMap.put(String.valueOf(word[i]),childMap); // 如果该字符不存在,则插入
                }

                parentMap = childMap; // 将子map的引用赋值给父map,进行下一轮循环
                if(i==word.length-1){
                    parentMap.put("isEnd",true);
                }else{
                    parentMap.put("isEnd",false);
                }
            }
        }
        
        return sensitiveWordMap;
    }
   
    /**
     * 从敏感词库中去除敏感词
     * @param set
     */
    @SuppressWarnings("unchecked")
   public void removeSensitiveWordFromMap(Set<String> set){
       Map<String,Object> sensitiveWordMap = getSensitiveWordMap();
       Map<String,Object> childMap = new HashMap<>(); //子Map
        Map<String,Object> parentMap = new HashMap<>(); //父map

        Iterator<String> iterator = set.iterator();
        while (iterator.hasNext()){
            char[] word = iterator.next().toCharArray();
            parentMap = sensitiveWordMap;
            for (int i=0;i<word.length;i++){
                childMap = (Map<String, Object>) parentMap.get(String.valueOf(word[i]));
                if(childMap==null){
                    childMap = new HashMap<>();
                    parentMap.put(String.valueOf(word[i]),childMap); // 如果该字符不存在,则插入
                }

                parentMap = childMap; // 将子map的引用赋值给父map,进行下一轮循环
                parentMap.put("isEnd",false); //始终设置为false,则将此敏感词变相清除了
            }
        }
        
        SensitiveWordsCache.TRIE_TREE.set("sensitiveWords",sensitiveWordMap);
    }

    /**
     * 在敏感词库中添加敏感词
     * @param set
     */
    public void addSensitiveWordFromMap(Set<String> set){
       Map<String,Object> sensitiveWordMap = buildSensitiveWordMap(set, getSensitiveWordMap());
       SensitiveWordsCache.TRIE_TREE.set("sensitiveWords",sensitiveWordMap);
    }
    
    /**
     * 重新构建
     * @return
     */
    public Map<String,Object> reBuild(){
       Map<String,Object> sensitiveWordMap = new HashMap<>();
       HashSet<String> set = getSensitiveWords();
        sensitiveWordMap = buildSensitiveWordMap(set,new HashMap<String,Object>(set.size()));
        SensitiveWordsCache.TRIE_TREE.set("sensitiveWords",sensitiveWordMap,1 * 24 * 60 * 60 * 30); // 敏感词数据一般不会变动,设置一个月的超时时间
        return sensitiveWordMap;
    }
    
    /**
     * 敏感词过滤
     * @param text
     * @return
     */
    public String doFilter(String text){
        char[] word = text.toCharArray();
        Map<String,Object> sensitiveWordMap = getSensitiveWordMap();
        for(int i=0;i<word.length-1;i++){
            int index = doFilter(word,sensitiveWordMap,i);
            if(index==0){
                continue;
            }else{
               String words = text.substring(i, index+1);
                text = text.replace(words,words.replaceAll("[\\s\\S]","*"));
                i = index;
            }
        }
        return text;
    }
    
    @SuppressWarnings("unchecked")
   public int doFilter(char[] word,Map<String,Object> currentMap,int i){
        if(i>=word.length){
            return 0;
        }
        String s = String.valueOf(word[i]);
        currentMap = (Map<String, Object>) currentMap.get(s.toLowerCase()); //英文都存小写的
        if(currentMap!=null){
            if((boolean)currentMap.get("isEnd")){
                return i;
            }else{
                return doFilter(word,currentMap,i+1);
            }
        }
        return 0;
    }

    /**
     * 是否包含敏感词
     * @param text
     * @return
     */
    public boolean isContian(String text){
        char[] word = text.toCharArray();
        Map<String,Object> sensitiveWordMap = getSensitiveWordMap();
        for(int i=0;i<word.length;i++){
            int index = doFilter(word,sensitiveWordMap,i);
            if(index==0){
                continue;
            }else{
                return true;
            }
        }
        return false;
    }

    /**
     * 获取敏感词
     * @return
     */
    public HashSet<String> getSensitiveWords(){
        List<String> list = sensitiveWordMapper.getAllSensitiveWord();
        HashSet<String> set= Sets.newHashSet(list);
        return set;
    }
    
    /**
     * 添加敏感词
     * @param sensitiveWordList
     * @param uid
     * @return
     */
    public void saveSensitiveWords(List<Map<String,Object>> sensitiveWordList,String uid){
       List<SensitiveWord> list = new ArrayList<>();
       Date date = new Date();
       Set<String> set = new HashSet<String>();
       for(Map<String,Object> map:sensitiveWordList){
          SensitiveWord sensitiveWord = BeanUtils.copyToNewBean(map, SensitiveWord.class);
          set.add(sensitiveWord.getName());
          sensitiveWord.setCreator(uid);
          sensitiveWord.setUpdator(uid);
          sensitiveWord.setCreateTime(date);
          sensitiveWord.setUpdateTime(date);
          sensitiveWord.setStatus(1);
          list.add(sensitiveWord);
       }
       sensitiveWordMapper.insertSelectiveBatch(list);
       //在敏感词库中添加敏感词
       addSensitiveWordFromMap(set);
    }

    /**
     * 更新状态
     * @param sensitiveWord
     */
    public void updateStatus(SensitiveWord sensitiveWord){
       Set<String> removeSet = new HashSet<>();
       Set<String> addSet = new HashSet<>();
       sensitiveWordMapper.updateByPrimaryKeySelective(sensitiveWord);
       Integer status = sensitiveWord.getStatus();
       if(status==0){
          removeSet.add(sensitiveWord.getName()); //状态更新为未启用,从敏感词库中删除
          removeSensitiveWordFromMap(removeSet);
       }else if(status==1){
          addSet.add(sensitiveWord.getName()); //状态更新为启用,添加到敏感词库中
          addSensitiveWordFromMap(addSet);
       }

    }
  
    /**
     * 更新名称
     * 必须rebuild才能使敏感词库更新
     * @param sensitiveWord
     */
    public void updateName(SensitiveWord sensitiveWord){
       sensitiveWordMapper.updateByPrimaryKeySelective(sensitiveWord);
       reBuild();
    }
    
    /**
     * 删除敏感词
     * @param sensitiveWord
     */
    public void deleteSensitiveWords(SensitiveWord sensitiveWord){
       sensitiveWordMapper.deleteByPrimaryKey(sensitiveWord.getId());
       //同时删除敏感词库中的敏感词
       Set<String> set = new HashSet<>();
       set.add(sensitiveWord.getName());
       removeSensitiveWordFromMap(set);
    }
    
    /**
     * 获取敏感词
     * @param sensitiveWord
     * @return
     */
    @SuppressWarnings("unchecked")
   public PageDto<SensitiveWord> getSensitiveWord(SensitiveWord sensitiveWord,PageParam page){
       Page<SensitiveWord> sensitiveWordPage = new Page<>(page.getPageNo(),page.getPageSize());
       sensitiveWordPage.setRecords(sensitiveWordMapper.getSensitiveWord(sensitiveWord, sensitiveWordPage));
       PageDto<SensitiveWord> sensitiveWordPageDto = BeanUtils.copyToNewBean(sensitiveWordPage, PageDto.class);
       return sensitiveWordPageDto;
    }
    
    /**
     * 初始化
     */
    @SuppressWarnings("unchecked")
    public Map<String,Object> getSensitiveWordMap(){
        Object trieTree = SensitiveWordsCache.TRIE_TREE.get("sensitiveWords");
        Map<String,Object> sensitiveWordMap = new HashMap<>();
        if (trieTree!=null){
           sensitiveWordMap = (Map<String, Object>) trieTree;
        }else {
           sensitiveWordMap = reBuild();
        }
       
        return sensitiveWordMap;
    }

   @Override
    public void onApplicationEvent(ContextRefreshedEvent event) {
        if(event.getApplicationContext().getParent() == null) {
            logger.info("开始构建敏感词库");
            getSensitiveWordMap();
            logger.info("敏感词库构建完毕");
        }
    }
}

转载于:https://my.oschina.net/yanyimin/blog/1593974

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值