最近在做敏感词管理的工作,在网上查阅了很多资料,最终采用了构建前缀树(Trie)的方案,过滤速度还算令人满意,在此记录一下。
/** * 敏感词处理 * @author yanyimin * @date 2017年11月17日 上午10:24:27 * @version 1.0.0 */ @Service public class SensitiveWordsBiz implements ApplicationListener<ContextRefreshedEvent>{ @Autowired SensitiveWordMapper sensitiveWordMapper; private Logger logger = LoggerFactory.getLogger(SensitiveWordsBiz.class); /** * 构造trie树 - 构建敏感词库 * @param set */ @SuppressWarnings("unchecked") public Map<String,Object> buildSensitiveWordMap(Set<String> set,Map<String,Object> sensitiveWordMap){ Map<String,Object> childMap = new HashMap<>(); // 子Map Map<String,Object> parentMap = new HashMap<>(); // 父map Iterator<String> iterator = set.iterator(); while (iterator.hasNext()){ char[] word = iterator.next().toCharArray(); parentMap = sensitiveWordMap; for (int i=0;i<word.length;i++){ childMap = (Map<String, Object>) parentMap.get(String.valueOf(word[i])); if(childMap==null){ childMap = new HashMap<>(); parentMap.put(String.valueOf(word[i]),childMap); // 如果该字符不存在,则插入 } parentMap = childMap; // 将子map的引用赋值给父map,进行下一轮循环 if(i==word.length-1){ parentMap.put("isEnd",true); }else{ parentMap.put("isEnd",false); } } } return sensitiveWordMap; } /** * 从敏感词库中去除敏感词 * @param set */ @SuppressWarnings("unchecked") public void removeSensitiveWordFromMap(Set<String> set){ Map<String,Object> sensitiveWordMap = getSensitiveWordMap(); Map<String,Object> childMap = new HashMap<>(); //子Map Map<String,Object> parentMap = new HashMap<>(); //父map Iterator<String> iterator = set.iterator(); while (iterator.hasNext()){ char[] word = iterator.next().toCharArray(); parentMap = sensitiveWordMap; for (int i=0;i<word.length;i++){ childMap = (Map<String, Object>) parentMap.get(String.valueOf(word[i])); if(childMap==null){ childMap = new HashMap<>(); parentMap.put(String.valueOf(word[i]),childMap); // 如果该字符不存在,则插入 } parentMap = childMap; // 将子map的引用赋值给父map,进行下一轮循环 parentMap.put("isEnd",false); //始终设置为false,则将此敏感词变相清除了 } } SensitiveWordsCache.TRIE_TREE.set("sensitiveWords",sensitiveWordMap); } /** * 在敏感词库中添加敏感词 * @param set */ 
public void addSensitiveWordFromMap(Set<String> set){ Map<String,Object> sensitiveWordMap = buildSensitiveWordMap(set, getSensitiveWordMap()); SensitiveWordsCache.TRIE_TREE.set("sensitiveWords",sensitiveWordMap); } /** * 重新构建 * @return */ public Map<String,Object> reBuild(){ Map<String,Object> sensitiveWordMap = new HashMap<>(); HashSet<String> set = getSensitiveWords(); sensitiveWordMap = buildSensitiveWordMap(set,new HashMap<String,Object>(set.size())); SensitiveWordsCache.TRIE_TREE.set("sensitiveWords",sensitiveWordMap,1 * 24 * 60 * 60 * 30); // 敏感词数据一般不会变动,设置一个月的超时时间 return sensitiveWordMap; } /** * 敏感词过滤 * @param text * @return */ public String doFilter(String text){ char[] word = text.toCharArray(); Map<String,Object> sensitiveWordMap = getSensitiveWordMap(); for(int i=0;i<word.length-1;i++){ int index = doFilter(word,sensitiveWordMap,i); if(index==0){ continue; }else{ String words = text.substring(i, index+1); text = text.replace(words,words.replaceAll("[\\s\\S]","*")); i = index; } } return text; } @SuppressWarnings("unchecked") public int doFilter(char[] word,Map<String,Object> currentMap,int i){ if(i>=word.length){ return 0; } String s = String.valueOf(word[i]); currentMap = (Map<String, Object>) currentMap.get(s.toLowerCase()); //英文都存小写的 if(currentMap!=null){ if((boolean)currentMap.get("isEnd")){ return i; }else{ return doFilter(word,currentMap,i+1); } } return 0; } /** * 是否包含敏感词 * @param text * @return */ public boolean isContian(String text){ char[] word = text.toCharArray(); Map<String,Object> sensitiveWordMap = getSensitiveWordMap(); for(int i=0;i<word.length;i++){ int index = doFilter(word,sensitiveWordMap,i); if(index==0){ continue; }else{ return true; } } return false; } /** * 获取敏感词 * @return */ public HashSet<String> getSensitiveWords(){ List<String> list = sensitiveWordMapper.getAllSensitiveWord(); HashSet<String> set= Sets.newHashSet(list); return set; } /** * 添加敏感词 * @param sensitiveWordList * @param uid * @return */ public void 
saveSensitiveWords(List<Map<String,Object>> sensitiveWordList,String uid){ List<SensitiveWord> list = new ArrayList<>(); Date date = new Date(); Set<String> set = new HashSet<String>(); for(Map<String,Object> map:sensitiveWordList){ SensitiveWord sensitiveWord = BeanUtils.copyToNewBean(map, SensitiveWord.class); set.add(sensitiveWord.getName()); sensitiveWord.setCreator(uid); sensitiveWord.setUpdator(uid); sensitiveWord.setCreateTime(date); sensitiveWord.setUpdateTime(date); sensitiveWord.setStatus(1); list.add(sensitiveWord); } sensitiveWordMapper.insertSelectiveBatch(list); //在敏感词库中添加敏感词 addSensitiveWordFromMap(set); } /** * 更新状态 * @param sensitiveWord */ public void updateStatus(SensitiveWord sensitiveWord){ Set<String> removeSet = new HashSet<>(); Set<String> addSet = new HashSet<>(); sensitiveWordMapper.updateByPrimaryKeySelective(sensitiveWord); Integer status = sensitiveWord.getStatus(); if(status==0){ removeSet.add(sensitiveWord.getName()); //状态更新为未启用,从敏感词库中删除 removeSensitiveWordFromMap(removeSet); }else if(status==1){ addSet.add(sensitiveWord.getName()); //状态更新为启用,添加到敏感词库中 addSensitiveWordFromMap(addSet); } } /** * 更新名称 * 必须rebuild才能使敏感词库更新 * @param sensitiveWord */ public void updateName(SensitiveWord sensitiveWord){ sensitiveWordMapper.updateByPrimaryKeySelective(sensitiveWord); reBuild(); } /** * 删除敏感词 * @param sensitiveWord */ public void deleteSensitiveWords(SensitiveWord sensitiveWord){ sensitiveWordMapper.deleteByPrimaryKey(sensitiveWord.getId()); //同时删除敏感词库中的敏感词 Set<String> set = new HashSet<>(); set.add(sensitiveWord.getName()); removeSensitiveWordFromMap(set); } /** * 获取敏感词 * @param sensitiveWord * @return */ @SuppressWarnings("unchecked") public PageDto<SensitiveWord> getSensitiveWord(SensitiveWord sensitiveWord,PageParam page){ Page<SensitiveWord> sensitiveWordPage = new Page<>(page.getPageNo(),page.getPageSize()); sensitiveWordPage.setRecords(sensitiveWordMapper.getSensitiveWord(sensitiveWord, sensitiveWordPage)); PageDto<SensitiveWord> 
sensitiveWordPageDto = BeanUtils.copyToNewBean(sensitiveWordPage, PageDto.class); return sensitiveWordPageDto; } /** * 初始化 */ @SuppressWarnings("unchecked") public Map<String,Object> getSensitiveWordMap(){ Object trieTree = SensitiveWordsCache.TRIE_TREE.get("sensitiveWords"); Map<String,Object> sensitiveWordMap = new HashMap<>(); if (trieTree!=null){ sensitiveWordMap = (Map<String, Object>) trieTree; }else { sensitiveWordMap = reBuild(); } return sensitiveWordMap; } @Override public void onApplicationEvent(ContextRefreshedEvent event) { if(event.getApplicationContext().getParent() == null) { logger.info("开始构建敏感词库"); getSensitiveWordMap(); logger.info("敏感词库构建完毕"); } } }