目录
一、准备工作
首先需要在DB中准备好敏感词数据,供项目启动时读取并初始化。表结构和示例数据如下:
-- Sensitive word table: one row per word, keyed by word_id.
CREATE TABLE `sensitive_word` (
-- Primary key of the row.
`word_id` varchar(16) NOT NULL,
-- The word text itself; nullable, so readers of this table must tolerate NULL.
`word_content` varchar(64) DEFAULT NULL,
PRIMARY KEY (`word_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='敏感词表';
-- Sample rows. These statements qualify the table with the `maple` schema,
-- while the CREATE TABLE above does not.
INSERT INTO `maple`.`sensitive_word`(`word_id`, `word_content`) VALUES ('1', '冰毒');
INSERT INTO `maple`.`sensitive_word`(`word_id`, `word_content`) VALUES ('2', '白粉');
INSERT INTO `maple`.`sensitive_word`(`word_id`, `word_content`) VALUES ('3', '大麻');
INSERT INTO `maple`.`sensitive_word`(`word_id`, `word_content`) VALUES ('4', '大坏蛋');
二、初始化数据
通过实现InitializingBean接口,在项目启动的时候从DB中读取所有敏感词,把每个词按字拆开,逐层构建成一个嵌套的Map(字典树):每个字对应一个子Map,isEnd=1表示该字是某个敏感词的最后一个字,isEnd=0表示不是。处理之后的Map格式是这样子的:
{冰={毒={isEnd=1}, isEnd=0}, 白={粉={isEnd=1}, isEnd=0}, 大={麻={isEnd=1}, isEnd=0, 坏={蛋={isEnd=1}, isEnd=0}}}
三、解析数据
当一段文本去做敏感词检测时,需要解析上面这个Map,具体代码如下:
package com.gane.maple.sensitiveword;
import com.gane.maple.entity.SensitiveWord;
import com.gane.maple.service.SensitiveWordService;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import java.util.*;
/**
 * Sensitive word filter backed by a character trie stored as nested maps.
 *
 * <p>The trie is built once in {@link #afterPropertiesSet()} from the words returned by
 * {@code SensitiveWordService.list()}. Each node is a {@code Map} whose {@code Character}
 * keys lead to child nodes and whose {@code "isEnd"} key holds {@code "1"} when the path
 * from the root to that node spells a complete word, {@code "0"} otherwise. Example shape:
 * {@code {a={b={isEnd=1}, isEnd=0}}} for the single word "ab".
 *
 * <p>Words shorter than two characters are never reported as matches.
 *
 * @author ODC-WHB
 * @date 2021/3/8
 */
@Configuration
public class SensitiveWordFilter implements InitializingBean {

    /** Key under which every trie node stores its end-of-word marker. */
    private static final String END_KEY = "isEnd";

    /** Marker value: the node terminates a complete word. */
    private static final String END_YES = "1";

    /** Marker value: the node is only an intermediate character. */
    private static final String END_NO = "0";

    /** Matches shorter than this many characters are ignored. */
    private static final int MIN_WORD_LENGTH = 2;

    @Autowired
    private SensitiveWordService sensitiveWordService;

    /** Root of the trie; stays {@code null} until {@link #afterPropertiesSet()} has run. */
    private static Map<Object, Object> sensitiveWordMap;

    /** Match type: stop at the shortest word that starts at a given position. */
    public static int minMatchType = 1;

    /** Match type: take the longest word that starts at a given position. */
    public static int maxMatchType = 2;

    /**
     * Tells whether the text contains a sensitive word, using the shortest-match rule.
     *
     * @param txt text to inspect; {@code null} is treated as containing nothing
     * @return {@code true} if at least one sensitive word occurs in {@code txt}
     */
    public boolean isSensitive(String txt) {
        return isSensitive(txt, minMatchType);
    }

    /**
     * Tells whether the text contains a sensitive word.
     *
     * @param txt       text to inspect; {@code null} is treated as containing nothing
     * @param matchType {@link #minMatchType} or {@link #maxMatchType}
     * @return {@code true} if at least one sensitive word occurs in {@code txt}
     */
    public boolean isSensitive(String txt, int matchType) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                // One hit answers the question; no need to scan the rest.
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the sensitive words found in the text, using the longest-match rule.
     *
     * @param txt text to inspect; {@code null} yields an empty set
     * @return the distinct sensitive words found, never {@code null}
     */
    public Set<String> getSensitiveWord(String txt) {
        return getSensitiveWord(txt, maxMatchType);
    }

    /**
     * Collects the sensitive words found in the text.
     *
     * <p>The scan moves left to right and resumes right after each match, so reported
     * words never overlap.
     *
     * @param txt       text to inspect; {@code null} yields an empty set
     * @param matchType {@link #minMatchType} or {@link #maxMatchType}
     * @return the distinct sensitive words found, never {@code null}
     */
    public Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<>();
        if (txt == null) {
            return found;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length;
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * Masks every sensitive word in the text, using the longest-match rule.
     *
     * @param txt         text to mask; {@code null} is returned unchanged
     * @param replaceChar string written once for every character of a matched word
     * @return the masked text
     */
    public String replaceSensitiveWord(String txt, String replaceChar) {
        return replaceSensitiveWord(txt, maxMatchType, replaceChar);
    }

    /**
     * Masks every sensitive word in the text.
     *
     * <p>The text is rewritten in a single left-to-right pass: each matched span is replaced
     * by {@code replaceChar} repeated once per matched character. No regular expressions are
     * involved, so neither the words nor {@code replaceChar} are interpreted as patterns,
     * and the result does not depend on the order in which words are visited.
     *
     * @param txt         text to mask; {@code null} is returned unchanged
     * @param matchType   {@link #minMatchType} or {@link #maxMatchType}
     * @param replaceChar string written once for every character of a matched word
     * @return the masked text
     */
    public String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
        if (txt == null) {
            return null;
        }
        StringBuilder masked = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                masked.append(getReplaceChars(replaceChar, length));
                i += length;
            } else {
                masked.append(txt.charAt(i));
                i++;
            }
        }
        return masked.toString();
    }

    /**
     * Builds the mask for one matched word.
     *
     * @param replaceChar string to repeat
     * @param length      number of repetitions
     * @return {@code replaceChar} concatenated {@code length} times
     */
    private String getReplaceChars(String replaceChar, int length) {
        StringBuilder mask = new StringBuilder();
        for (int i = 0; i < length; i++) {
            mask.append(replaceChar);
        }
        return mask.toString();
    }

    /**
     * Measures the sensitive word that starts exactly at {@code beginIndex}.
     *
     * <p>The trie is walked character by character. The returned length is that of the last
     * complete word seen on the walk, not the number of characters walked, so a partial
     * walk into a longer word does not inflate the match.
     *
     * @param txt        text being scanned
     * @param beginIndex index in {@code txt} where the word must start
     * @param matchType  {@link #minMatchType} stops at the first complete word,
     *                   any other value keeps going for the longest one
     * @return length of the matched word, or 0 when no word of at least
     *         {@link #MIN_WORD_LENGTH} characters starts at {@code beginIndex}
     */
    private int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        Map<Object, Object> node = sensitiveWordMap;
        if (node == null || txt == null) {
            // Trie not initialised yet, or nothing to scan.
            return 0;
        }
        int matchedLength = 0;
        for (int i = beginIndex; i < txt.length(); i++) {
            node = childOf(node, txt.charAt(i));
            if (node == null) {
                break;
            }
            int depth = i - beginIndex + 1;
            if (depth >= MIN_WORD_LENGTH && isEnd(node)) {
                matchedLength = depth;
                if (SensitiveWordFilter.minMatchType == matchType) {
                    break;
                }
            }
        }
        return matchedLength;
    }

    /**
     * Tells whether a trie node terminates a complete word.
     *
     * @param nowMap trie node to test
     * @return {@code true} if the node's {@code "isEnd"} entry is {@code "1"}
     */
    private boolean isEnd(Map<Object, Object> nowMap) {
        return END_YES.equals(nowMap.get(END_KEY));
    }

    /**
     * Looks up the child node reached from {@code node} through character {@code c}.
     *
     * @param node current trie node
     * @param c    next character
     * @return the child node, or {@code null} if there is none
     */
    @SuppressWarnings("unchecked") // Character keys only ever map to nodes created in addWord.
    private static Map<Object, Object> childOf(Map<Object, Object> node, char c) {
        return (Map<Object, Object>) node.get(Character.valueOf(c));
    }

    /** Builds the trie once the injected service is available. */
    @Override
    public void afterPropertiesSet() throws Exception {
        initSensitiveWord();
    }

    /**
     * Loads all words from the service and builds the trie.
     *
     * <p>The trie is assembled in a local map and assigned to the static field only when
     * complete. Null or empty word contents are skipped.
     */
    private void initSensitiveWord() {
        List<SensitiveWord> list = sensitiveWordService.list();
        Map<Object, Object> root = new HashMap<>();
        if (list != null) {
            for (SensitiveWord entry : list) {
                String content = entry == null ? null : entry.getWordContent();
                if (content == null || content.isEmpty()) {
                    continue;
                }
                addWord(root, content);
            }
        }
        sensitiveWordMap = root;
        // Debug output of the finished trie, printed once.
        System.out.println(sensitiveWordMap);
    }

    /**
     * Inserts one word into the trie, creating missing nodes along the way.
     *
     * @param root trie root
     * @param word non-empty word to insert
     */
    private static void addWord(Map<Object, Object> root, String word) {
        Map<Object, Object> node = root;
        for (int i = 0; i < word.length(); i++) {
            char key = word.charAt(i);
            Map<Object, Object> child = childOf(node, key);
            if (child == null) {
                // New nodes start as intermediate; only the last character is flagged below.
                child = new HashMap<>();
                child.put(END_KEY, END_NO);
                node.put(Character.valueOf(key), child);
            }
            node = child;
        }
        node.put(END_KEY, END_YES);
    }
}
四、测试
TestController
package com.gane.maple.controller;
import com.gane.maple.sensitiveword.SensitiveWordFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import java.util.Set;
/**
 * Demo controller that runs the three operations of {@link SensitiveWordFilter}
 * (detect, list, mask) on a request parameter and prints each result to standard output.
 *
 * @author ODC-WHB
 * @date 2021/3/8
 */
@RestController
public class TestController {

    @Autowired
    private SensitiveWordFilter sensitiveWordFilter;

    /**
     * Checks the given text for sensitive words.
     *
     * @param text text to check; a missing value is treated as an empty string so the
     *             request does not fail with a {@code NullPointerException}
     * @return the sensitive words found in {@code text}
     */
    @GetMapping("/test")
    public Set<String> test(String text) {
        String input = text == null ? "" : text;
        System.out.println("原文为:" + input);

        boolean sensitive = sensitiveWordFilter.isSensitive(input);
        System.out.println(sensitive ? "原文中有敏感词" : "原文中没有敏感词");

        Set<String> sensitiveWord = sensitiveWordFilter.getSensitiveWord(input);
        System.out.println("敏感词有:");
        sensitiveWord.forEach(System.out::println);

        String replaceSensitiveWord = sensitiveWordFilter.replaceSensitiveWord(input, "*");
        System.out.println("把敏感词替换成*为:" + replaceSensitiveWord);

        return sensitiveWord;
    }
}
启动项目,访问 localhost:8080/test?text=张三是个大坏蛋,他竟然吸食白粉和冰毒