敏感词检测

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;

public class BadWordsUtil {

 // Number of slots allocated for badWordsList; initbadWordsList() files each
 // word into the slot whose index equals the word's length.
 public static final int WORDS_MAX_LENGTH = 10;
 // Name of the classpath resource holding the word list (text file, one bad word per line).
 public static final String BAD_WORDS_LIB_FILE_NAME ="badWords.txt";

// Bad-word tables, one map per word length: badWordsList[n] holds the
// lower-cased words of length n as keys (values are empty strings).
// Stays null until initbadWordsList() allocates it.
 public static Map[] badWordsList = null;

// Bad-word index keyed by a word's first character (as a one-character string).
// The value is a bit mask: bit n (value 2^n) is set when at least one loaded
// word of length n starts with that character.
 public static Map<String, Integer> wordIndex = new HashMap<String, Integer>();

/**
 * Loads the bad-word library and fills {@code badWordsList} and {@code wordIndex}.
 *
 * <p>The library is the classpath resource {@code BAD_WORDS_LIB_FILE_NAME}, read as
 * UTF-8 text with one bad word per line. Blank lines are ignored. Every word is
 * stripped of surrounding whitespace and lower-cased, then stored in
 * {@code badWordsList[length]}; the bit for its length is set in the
 * {@code wordIndex} entry of its first character.
 *
 * <p>Words whose length does not fit into the table
 * ({@code length >= badWordsList.length}) are skipped with a warning on
 * {@code System.err} instead of aborting the whole load.
 *
 * @throws IOException if the resource is missing, is not a plain file on disk,
 *                     or cannot be read
 */
 public static void initbadWordsList() throws IOException {
     // FileUtils.toFile decodes URL escapes (e.g. %20 for a space), which
     // URL.getPath() does not, and yields null for a missing or non-file URL.
     File libFile = FileUtils.toFile(
             BadWordsUtil.class.getClassLoader().getResource(BAD_WORDS_LIB_FILE_NAME));
     if (libFile == null) {
         throw new IOException("Bad-words library '" + BAD_WORDS_LIB_FILE_NAME
                 + "' was not found on the classpath as a plain file");
     }
     System.out.println(libFile.getPath());

     // Read before touching the static state: if reading fails, badWordsList
     // stays null and the lazy initialisation in searchBanWords can retry.
     List<String> lines = FileUtils.readLines(libFile, "UTF-8");

     if (badWordsList == null) {
         Map[] buckets = new Map[WORDS_MAX_LENGTH];
         for (int i = 0; i < buckets.length; i++) {
             buckets[i] = new HashMap<String, String>();
         }
         badWordsList = buckets;
     }

     for (String line : lines) {
         if (StringUtils.isBlank(line)) {
             continue;
         }

         // Normalise once so the per-length table and the first-character
         // index are keyed by exactly the same text.
         String word = StringUtils.strip(line).toLowerCase(java.util.Locale.ROOT);
         int length = word.length();

         if (length >= badWordsList.length) {
             // No slot exists for this length; indexing it would overflow the array.
             System.err.println("Skipping bad word longer than "
                     + (badWordsList.length - 1) + " characters: " + word);
             continue;
         }

         // Store the word in the table for its length.
         badWordsList[length].put(word, "");

         // Record the length as a bit flag under the word's first character.
         String firstChar = word.substring(0, 1);
         Integer mask = wordIndex.get(firstChar);
         if (mask == null) {
             mask = 0;
         }
         wordIndex.put(firstChar, mask | (1 << length));
     }
 }

/**
 * 检索敏感词
 * @param content
 * @return
*/
 public static List<String> searchBanWords(String content) {
 if (badWordsList == null) {
 try {
initbadWordsList();
 } catch (IOException e) {
 throw new RuntimeException(e);
}
}

 List<String> result = new ArrayList<String>();

 for (int i = 0; i < content.length(); i++) {
 Integer index = wordIndex.get(content.substring(i, i + 1));
 int p = 0;

 while ((index != null) && (index > 0)) {
p++;
 index = index >> 1;

 String sub ="";

 if ((i + p) < (content.length() - 1)) {
 sub = content.substring(i, i + p);
 } else {
 sub = content.substring(i);
}

 if (((index % 2) == 1) && badWordsList[p].containsKey(sub)) {
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;

public class BadWordsUtil {

 // Number of slots allocated for badWordsList; initbadWordsList() files each
 // word into the slot whose index equals the word's length.
 public static final int WORDS_MAX_LENGTH = 10;
 // Name of the classpath resource holding the word list (text file, one bad word per line).
 public static final String BAD_WORDS_LIB_FILE_NAME ="badWords.txt";

// Bad-word tables, one map per word length: badWordsList[n] holds the
// lower-cased words of length n as keys (values are empty strings).
// Stays null until initbadWordsList() allocates it.
 public static Map[] badWordsList = null;

// Bad-word index keyed by a word's first character (as a one-character string).
// The value is a bit mask: bit n (value 2^n) is set when at least one loaded
// word of length n starts with that character.
 public static Map<String, Integer> wordIndex = new HashMap<String, Integer>();

/**
 * Loads the bad-word library and fills {@code badWordsList} and {@code wordIndex}.
 *
 * <p>The library is the classpath resource {@code BAD_WORDS_LIB_FILE_NAME}, read as
 * UTF-8 text with one bad word per line. Blank lines are ignored. Every word is
 * stripped of surrounding whitespace and lower-cased, then stored in
 * {@code badWordsList[length]}; the bit for its length is set in the
 * {@code wordIndex} entry of its first character.
 *
 * <p>Words whose length does not fit into the table
 * ({@code length >= badWordsList.length}) are skipped with a warning on
 * {@code System.err} instead of aborting the whole load.
 *
 * @throws IOException if the resource is missing, is not a plain file on disk,
 *                     or cannot be read
 */
 public static void initbadWordsList() throws IOException {
     // FileUtils.toFile decodes URL escapes (e.g. %20 for a space), which
     // URL.getPath() does not, and yields null for a missing or non-file URL.
     File libFile = FileUtils.toFile(
             BadWordsUtil.class.getClassLoader().getResource(BAD_WORDS_LIB_FILE_NAME));
     if (libFile == null) {
         throw new IOException("Bad-words library '" + BAD_WORDS_LIB_FILE_NAME
                 + "' was not found on the classpath as a plain file");
     }
     System.out.println(libFile.getPath());

     // Read before touching the static state: if reading fails, badWordsList
     // stays null and the lazy initialisation in searchBanWords can retry.
     List<String> lines = FileUtils.readLines(libFile, "UTF-8");

     if (badWordsList == null) {
         Map[] buckets = new Map[WORDS_MAX_LENGTH];
         for (int i = 0; i < buckets.length; i++) {
             buckets[i] = new HashMap<String, String>();
         }
         badWordsList = buckets;
     }

     for (String line : lines) {
         if (StringUtils.isBlank(line)) {
             continue;
         }

         // Normalise once so the per-length table and the first-character
         // index are keyed by exactly the same text.
         String word = StringUtils.strip(line).toLowerCase(java.util.Locale.ROOT);
         int length = word.length();

         if (length >= badWordsList.length) {
             // No slot exists for this length; indexing it would overflow the array.
             System.err.println("Skipping bad word longer than "
                     + (badWordsList.length - 1) + " characters: " + word);
             continue;
         }

         // Store the word in the table for its length.
         badWordsList[length].put(word, "");

         // Record the length as a bit flag under the word's first character.
         String firstChar = word.substring(0, 1);
         Integer mask = wordIndex.get(firstChar);
         if (mask == null) {
             mask = 0;
         }
         wordIndex.put(firstChar, mask | (1 << length));
     }
 }

/**
 * Finds every occurrence of a loaded bad word inside {@code content}.
 *
 * <p>The word library is loaded on first use if {@code badWordsList} is still
 * null. For each position in the text, the {@code wordIndex} entry of the
 * character at that position tells which word lengths are worth testing; each
 * candidate substring of such a length is then looked up in the table for that
 * length. Lookups are done on the lower-cased text because the tables hold
 * lower-cased words.
 *
 * @param content the text to scan; {@code null} or empty text yields an empty list
 * @return the matched substrings exactly as they appear in {@code content}, in
 *         scan order (by start position, then by increasing length); a word that
 *         occurs several times is reported once per occurrence
 * @throws RuntimeException wrapping the {@link IOException} if the word library
 *         cannot be loaded
 */
 public static List<String> searchBanWords(String content) {
     List<String> result = new ArrayList<String>();
     if (content == null || content.length() == 0) {
         return result;
     }

     if (badWordsList == null) {
         try {
             initbadWordsList();
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
     }

     int total = content.length();

     for (int start = 0; start < total; start++) {
         String firstChar = content.substring(start, start + 1)
                 .toLowerCase(java.util.Locale.ROOT);
         Integer lengths = wordIndex.get(firstChar);
         if (lengths == null) {
             continue;
         }

         int remaining = total - start;
         // Bit n of the index value flags "a word of length n starts with this
         // character"; dropping bit 0 lines bit 0 of the mask up with length 1.
         int mask = lengths >> 1;

         // Stop once no flagged length is left, the candidate would run past the
         // end of the text, or no table exists for the length. The candidate is
         // always exactly len characters long, so a word followed by a single
         // trailing character is found as well.
         for (int len = 1;
                 mask > 0 && len <= remaining && len < badWordsList.length;
                 len++, mask >>= 1) {
             if ((mask & 1) == 0) {
                 continue;
             }
             String candidate = content.substring(start, start + len);
             if (badWordsList[len].containsKey(
                     candidate.toLowerCase(java.util.Locale.ROOT))) {
                 result.add(candidate);
             }
         }
     }

     return result;
 }

 /**
  * Small demo: loads the word library, scans a fixed sample sentence and
  * prints either every hit or a "nothing found" message to standard output.
  *
  * @param args unused
  * @throws IOException if the word library cannot be loaded
  */
 public static void main(String[] args) throws IOException {
     final String sample = "含有敏感词的测试";

     BadWordsUtil.initbadWordsList();
     final List<String> hits = BadWordsUtil.searchBanWords(sample);

     // Nothing matched: report that and finish.
     if (hits.isEmpty()) {
         System.out.println("没有找到敏感词!");
         return;
     }

     // Otherwise print one line per match.
     for (String hit : hits) {
         System.out.println("找到敏感词:" + hit);
     }
 }
result.add(content.substring(i, i + p));}}} return result;} public static void main(String[] args) throws IOException { String content ="含有敏感词的测试";BadWordsUtil.initbadWordsList(); List<String> badWordList = BadWordsUtil.searchBanWords(content); if (badWordList.size() == 0){System.out.println("没有找到敏感词!");}else{ for(String s : badWordList){System.out.println("找到敏感词:"+s);}}}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值