恶俗评论敏感词过滤,可直接拿去用,动态热更新
过滤算法
核心是实现一个前缀树(Trie)数据结构:项目启动时读取敏感词库并进行预处理,把每个敏感词保存到前缀树中,以空间换时间。其中,addKeyWord方法用于向前缀树添加敏感词,filter方法为过滤方法。
/**
 * Sensitive-word filter backed by a prefix tree (trie).
 *
 * <p>Keywords are inserted with {@link #addKeyWord(String)}; {@link #filter(String)} scans a text
 * and substitutes every keyword occurrence with the configured replacement string. The whole trie
 * can be swapped at runtime through {@link #setRootNode(TrieNode)}.
 *
 * @author cy c
 * @date 2022/5/17 16:49
 */
@Component
public class SensitiveFilter {
    private static final Logger logger = LoggerFactory.getLogger(SensitiveFilter.class);

    /**
     * Text written in place of each matched keyword; injected from the {@code replacement} property.
     */
    @Value("${replacement}")
    private String REPLACEMENT;

    /**
     * Root of the trie. Declared volatile because the root is replaced through
     * {@link #setRootNode(TrieNode)} while {@link #filter(String)} may be running on another thread.
     */
    private volatile TrieNode rootNode = new TrieNode();

    public TrieNode getRootNode() {
        return rootNode;
    }

    /**
     * Replaces the whole trie with the one rooted at {@code rootNode}.
     *
     * @param rootNode root of a fully built trie
     */
    public void setRootNode(TrieNode rootNode) {
        this.rootNode = rootNode;
    }

    /**
     * Inserts one keyword into the trie. {@code null} and empty keywords are ignored.
     *
     * @param keyWord keyword to register
     */
    public void addKeyWord(String keyWord) {
        if (keyWord == null || keyWord.isEmpty()) {
            return;
        }
        TrieNode current = rootNode;
        for (int i = 0; i < keyWord.length(); i++) {
            char c = keyWord.charAt(i);
            TrieNode child = current.getSubNode(c);
            if (child == null) {
                child = new TrieNode();
                current.addSubNode(c, child);
            }
            current = child;
        }
        // The node reached by the last character marks the end of a keyword.
        current.setKeywordEnd(true);
    }

    /**
     * Filters sensitive words out of a text.
     *
     * <p>Symbol characters (see {@link #isSymbol(char)}) found in the middle of a candidate match
     * are skipped, so a keyword interleaved with symbols is still detected; the symbols inside a
     * matched span are replaced together with the keyword.
     *
     * @param text text to filter
     * @return the filtered text, or {@code null} when {@code text} is null, empty or whitespace only
     */
    public String filter(String text) {
        if (StringUtils.isBlank(text)) {
            return null;
        }
        // Work on one snapshot of the root so a concurrent setRootNode() cannot change the
        // "are we at the root?" comparisons halfway through the scan.
        final TrieNode root = rootNode;
        final int length = text.length();
        TrieNode tempNode = root;
        // begin: start of the current candidate; position: character currently examined.
        int begin = 0;
        int position = 0;
        StringBuilder sb = new StringBuilder(length);
        while (begin < length) {
            if (position >= length) {
                // The text ended in the middle of a candidate, so nothing starts at begin.
                // Emit that character and retry from the next one; a shorter keyword may still
                // start inside the tail.
                sb.append(text.charAt(begin));
                position = ++begin;
                tempNode = root;
                continue;
            }
            char c = text.charAt(position);
            if (isSymbol(c)) {
                // A symbol outside any candidate is copied to the output unchanged.
                if (tempNode == root) {
                    sb.append(c);
                    begin++;
                }
                // Inside a candidate the symbol is simply stepped over.
                position++;
                continue;
            }
            tempNode = tempNode.getSubNode(c);
            if (tempNode == null) {
                // No keyword starts at begin: keep that character and restart one step later.
                sb.append(text.charAt(begin));
                position = ++begin;
                tempNode = root;
            } else if (tempNode.isKeywordEnd()) {
                // text[begin..position] is a keyword: replace the whole span.
                sb.append(REPLACEMENT);
                begin = ++position;
                tempNode = root;
            } else {
                // Still a prefix of some keyword: look at the next character.
                position++;
            }
        }
        return sb.toString();
    }

    /**
     * Tells whether a character is a symbol that matching should skip: it is neither an ASCII
     * letter/digit nor inside the range 0x2E80-0x9FFF (East Asian characters).
     */
    private boolean isSymbol(char c) {
        return !CharUtils.isAsciiAlphanumeric(c) && (c < 0x2E80 || c > 0x9FFF);
    }

    /**
     * One node of the prefix tree. Static so that a node does not keep a reference to the
     * {@code SensitiveFilter} instance that created it.
     */
    private static class TrieNode {
        /** True when the path from the root to this node spells a complete keyword. */
        private boolean isKeywordEnd = false;

        /** Children, keyed by the next character. */
        private final Map<Character, TrieNode> subNodes = new HashMap<>();

        public boolean isKeywordEnd() {
            return isKeywordEnd;
        }

        public void setKeywordEnd(boolean keywordEnd) {
            isKeywordEnd = keywordEnd;
        }

        /** Registers {@code node} as the child reached through character {@code c}. */
        public void addSubNode(Character c, TrieNode node) {
            subNodes.put(c, node);
        }

        /** Returns the child reached through character {@code c}, or {@code null} if none. */
        public TrieNode getSubNode(Character c) {
            return subNodes.get(c);
        }
    }
}
敏感词库加载及初始化
这里用到了Spring的容器刷新事件(ContextRefreshedEvent)和commons-io包:commons-io用于对敏感词文件所在目录进行动态监听;容器刷新事件用于在项目启动完成后初始化前缀树(加载敏感词库),并开启文件监听线程。
/**
 * Loads the sensitive-word file into the {@link SensitiveFilter} and starts the file monitor
 * the first time a {@link ContextRefreshedEvent} is received.
 *
 * @author cy c
 * @date 2022/5/17 19:36
 */
public class ContextRefreshedListener implements ApplicationListener<ContextRefreshedEvent> {
    private static final Logger logger = LoggerFactory.getLogger(ContextRefreshedListener.class);

    /** Flipped to true by the first event; the reference itself never changes, hence final. */
    private final AtomicBoolean isInit = new AtomicBoolean(false);
    private final FileAlterationMonitor fileAlterationMonitor;
    private final SensitiveFilter sensitiveFilter;

    /** Injected from {@code filePath.sensitivePath}; concatenated with {@link #monitorFile}. */
    @Value("${filePath.sensitivePath}")
    private String monitorDir;

    /** Injected from {@code filePath.sensitiveFile}; appended to {@link #monitorDir}. */
    @Value("${filePath.sensitiveFile}")
    private String monitorFile;

    public ContextRefreshedListener(FileAlterationMonitor fileAlterationMonitor, SensitiveFilter sensitiveFilter) {
        this.fileAlterationMonitor = fileAlterationMonitor;
        this.sensitiveFilter = sensitiveFilter;
    }

    /**
     * Loads the word list and starts the file monitor once the container has been refreshed.
     */
    @Override
    public void onApplicationEvent(ContextRefreshedEvent event) {
        // CAS guard: only the first event gets through, so repeated events (e.g. from parent and
        // child containers) cannot start the monitor twice.
        if (!isInit.compareAndSet(false, true)) {
            return;
        }
        sensitiveInit();
        fileAdaptorInit();
    }

    /**
     * Starts the file monitor. A failure is logged with its stack trace and not rethrown.
     */
    public void fileAdaptorInit() {
        try {
            logger.info(">>>>>>>>>>>>>>>>>>>文件监听开启<<<<<<<<<<<<<<<<<<<<<<");
            fileAlterationMonitor.start();
        } catch (Exception e) {
            // Fill the placeholder with the message and pass the exception itself as the trailing
            // argument so the stack trace is logged as well.
            logger.error("监控出错:{}", e.getMessage(), e);
        }
    }

    /**
     * Reads the word file line by line and registers every non-blank line, with surrounding
     * whitespace removed, as a keyword. An I/O failure is logged and not rethrown.
     */
    public void sensitiveInit() {
        try (LineIterator it = FileUtils.lineIterator(new File(monitorDir + monitorFile))) {
            while (it.hasNext()) {
                String badWord = it.nextLine().trim();
                if (badWord.isEmpty()) {
                    // Blank lines carry no keyword.
                    continue;
                }
                sensitiveFilter.addKeyWord(badWord);
            }
            // Reported only when the whole file was read without an I/O error.
            logger.info(">>>>>>>>>>>>>>>>>>>加载恶评词完成<<<<<<<<<<<<<<<<<<<<<<");
        } catch (IOException e) {
            logger.error("恶评词加载失败:{}", e.getMessage(), e);
        }
    }
}
敏感词库的动态热更新
继承FileAlterationListenerAdaptor,重写文件变动方法(onFileChange):当监听到文件变动时,先用变动后的文件重新构建一棵新的前缀树,再用它的根节点替换SensitiveFilter中原有的根节点。
/**
 * File listener that rebuilds the sensitive-word trie when a monitored file changes and swaps
 * it into the shared {@link SensitiveFilter}.
 *
 * @author cy c
 * @date 2022/5/17 16:49
 */
public class RefreshSensitiveAdaptor extends FileAlterationListenerAdaptor {
    private final static Logger logger = LoggerFactory.getLogger(RefreshSensitiveAdaptor.class);

    @Autowired
    private SensitiveFilter sensitiveFilter;

    /**
     * Handles a file-modified notification by reloading the word list from {@code file}.
     * The completion message is logged only when the reload succeeded.
     */
    @Override
    public void onFileChange(File file) {
        logger.info("恶评词热更新:{}", file.getName());
        if (reload(file)) {
            logger.info("恶评词热更完成");
        }
    }

    /**
     * Rebuilds the trie from {@code file} and installs it in the shared filter. Failures are
     * logged and not rethrown; in that case the previously installed trie stays in place.
     *
     * @param file word list, one keyword per line
     */
    public void updateBadWord(File file) {
        reload(file);
    }

    /**
     * Builds a fresh trie from {@code file} and swaps it in only after the whole file was read.
     *
     * @return {@code true} when the new trie was installed, {@code false} when reading failed
     */
    private boolean reload(File file) {
        try (LineIterator it = FileUtils.lineIterator(file)) {
            // A throwaway filter is used purely as a trie builder, so the live trie is never
            // seen half-built.
            SensitiveFilter newSensitiveFilter = new SensitiveFilter();
            while (it.hasNext()) {
                String badWord = it.nextLine().trim();
                if (badWord.isEmpty()) {
                    // Blank lines carry no keyword.
                    continue;
                }
                newSensitiveFilter.addKeyWord(badWord);
            }
            sensitiveFilter.setRootNode(newSensitiveFilter.getRootNode());
            return true;
        } catch (Exception e) {
            // Caught broadly so that no failure escapes to the code that invoked the listener;
            // the exception is passed as the trailing argument to keep its stack trace.
            logger.error("恶评词更新失败!:{}", e.getMessage(), e);
            return false;
        }
    }
}
bean配置
/**
 * Bean wiring for the sensitive-word hot-update feature: the startup listener, the file-change
 * listener and the commons-io monitor that polls the word directory.
 */
@Configuration
public class ListenerAutoConfig {
    /** Directory observed for changes; injected from {@code filePath.sensitivePath}. */
    @Value("${filePath.sensitivePath}")
    private String monitorDir;

    /** Polling interval handed to the monitor, in milliseconds (20 seconds). */
    final long interval = TimeUnit.SECONDS.toMillis(20);

    /** Listener that loads the word list and starts the monitor after the context refresh. */
    @Bean
    @ConditionalOnMissingBean
    public ContextRefreshedListener contextRefreshedListener(FileAlterationMonitor fileAlterationMonitor, SensitiveFilter sensitiveFilter) {
        return new ContextRefreshedListener(fileAlterationMonitor, sensitiveFilter);
    }

    /** File listener that rebuilds the trie when a monitored file changes. */
    @Bean
    @ConditionalOnMissingBean
    public RefreshSensitiveAdaptor refreshBeanAdaptor() {
        return new RefreshSensitiveAdaptor();
    }

    /**
     * Monitor that polls {@code monitorDir} every {@code interval} milliseconds and notifies
     * {@code refreshSensitiveAdaptor}.
     *
     * <p>{@code destroyMethod = "stop"} is set explicitly because Spring only infers destroy
     * methods named {@code close} or {@code shutdown}; without it the monitor is never stopped
     * when the context closes.
     * NOTE(review): {@code stop()} throws if the monitor was never started; Spring is expected to
     * log that as a warning during shutdown rather than fail - confirm.
     */
    @Bean(destroyMethod = "stop")
    @ConditionalOnMissingBean
    public FileAlterationMonitor fileAlterationMonitor(RefreshSensitiveAdaptor refreshSensitiveAdaptor) {
        // Observer watching the word directory.
        FileAlterationObserver observer = new FileAlterationObserver(new File(monitorDir));
        observer.addListener(refreshSensitiveAdaptor);
        return new FileAlterationMonitor(interval, observer);
    }
}
yml配置
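需要在yml中配置以下三项:replacement(命中敏感词后的替换符)、filePath.sensitivePath(敏感词文件所在目录,同时也是被监听的目录)、filePath.sensitiveFile(敏感词文件名)。代码中是把目录和文件名直接拼接成文件路径的。敏感词文件每行一个敏感词。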
依赖(文件监听使用commons-io;代码中用到的StringUtils、CharUtils应来自commons-lang3,如项目中尚未引入需自行添加)
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>