java 过滤关键字 自定义字典库

使用 SensitiveWord 类进行敏感词过滤:先调用 InitializationWork() 加载词库,再调用 filterInfo() 过滤文本。
以下为过滤 demo:

/**
 * Masks sensitive words in text using a dictionary loaded from a classpath resource.
 *
 * <p>Typical use: construct an instance, call {@link #InitializationWork()} once to load
 * the dictionary, then call {@link #filterInfo(String)} for each text to be filtered.
 * After every {@code filterInfo} call, {@link #sensitiveWordSet} and
 * {@link #sensitiveWordList} describe the words found in that text.
 *
 * <p>{@code filterInfo} stores its per-call results in instance fields, so one instance
 * must not be used from several threads at once without external synchronization.
 */
public class SensitiveWord {
    /** Cached run of replacement characters; every mask is a prefix of this buffer. */
    private StringBuilder replaceAll;
    /** Character encoding used to read the dictionary resource. */
    private String encoding = "UTF-8";
    /** String that is repeated to build the mask. */
    private String replceStr = "*";
    /** Number of times {@link #replceStr} is repeated when the mask is pre-built. */
    private int replceSize = 500;
    /** Name of the dictionary resource, resolved through the class loader. */
    private String fileName = "CensorWords.txt";
    /** Dictionary entries, in file order and without duplicates once loaded. */
    private List<String> arrayList;
    /** Distinct sensitive words found by the most recent {@link #filterInfo(String)} call. */
    public Set<String> sensitiveWordSet;
    /** Sensitive words found by the most recent call, one entry per occurrence (for counting). */
    public List<String> sensitiveWordList;

    /**
     * Creates a filter that reads its dictionary from the given resource.
     * The file must be on the classpath (for example under {@code src} or {@code resources}).
     *
     * @param fileName dictionary file name, including its extension
     */
    public SensitiveWord(String fileName) {
        this.fileName = fileName;
    }

    /**
     * Creates a filter with a custom replacement string.
     *
     * @param replceStr  string that replaces sensitive words
     * @param replceSize number of repetitions of {@code replceStr} to pre-build
     */
    public SensitiveWord(String replceStr, int replceSize) {
        // Use the argument here, not the dictionary file name.
        this.replceStr = replceStr;
        this.replceSize = replceSize;
    }

    /** Creates a filter with the default dictionary name and {@code "*"} as replacement. */
    public SensitiveWord() {
    }

    /**
     * Replaces every dictionary word found in {@code str} with mask characters and records
     * the words that were found in {@link #sensitiveWordSet} and {@link #sensitiveWordList}.
     *
     * <p>When several dictionary words start at the same position the longest one wins.
     * A match that starts inside an already masked span is still masked but is not
     * recorded as a separate word. If no dictionary has been loaded or set yet,
     * {@link #InitializationWork()} is invoked first.
     *
     * @param str text to filter; {@code null} and empty text are returned unchanged
     * @return the text with all sensitive words masked
     * @throws IllegalStateException if a mask is needed but the replacement string is
     *         null or empty, or if the dictionary has to be loaded and cannot be found
     */
    public String filterInfo(String str) {
        sensitiveWordSet = new HashSet<String>();
        sensitiveWordList = new ArrayList<String>();
        if (str == null || str.isEmpty()) {
            return str;
        }
        if (arrayList == null) {
            InitializationWork();
        }

        int[] matchEnd = findMatchEnds(str);
        StringBuilder buffer = new StringBuilder(str);
        int maskedUntil = 0;
        // Walk the matches from left to right so the result does not depend on map ordering.
        for (int start = 0; start < matchEnd.length; start++) {
            int end = matchEnd[start];
            if (end == 0) {
                continue; // no dictionary word starts here
            }
            if (start >= maskedUntil) {
                // Read the word from the untouched input, never from the partly masked buffer.
                String sensitive = str.substring(start, end);
                sensitiveWordSet.add(sensitive);
                sensitiveWordList.add(sensitive);
            }
            buffer.replace(start, end, maskOfLength(end - start));
            if (end > maskedUntil) {
                maskedUntil = end;
            }
        }
        return buffer.toString();
    }

    /**
     * Finds, for every start index in {@code text}, the end index (exclusive) of the longest
     * dictionary word beginning there; {@code 0} means that no word starts at that index.
     */
    private int[] findMatchEnds(String text) {
        int[] matchEnd = new int[text.length()];
        for (String word : arrayList) {
            if (word == null || word.isEmpty()) {
                // An empty word matches everywhere without advancing and would loop forever.
                continue;
            }
            int from = 0;
            int start;
            while ((start = text.indexOf(word, from)) > -1) {
                from = start + word.length(); // continue searching after this occurrence
                if (from > matchEnd[start]) {
                    matchEnd[start] = from;
                }
            }
        }
        return matchEnd;
    }

    /**
     * Returns a mask of exactly {@code length} characters, extending the cached buffer with
     * further copies of the replacement string when it is too short.
     */
    private String maskOfLength(int length) {
        if (replaceAll == null) {
            replaceAll = new StringBuilder(length);
        }
        if (replaceAll.length() < length) {
            if (replceStr == null || replceStr.isEmpty()) {
                throw new IllegalStateException(
                        "Replacement string must not be null or empty when masking is required");
            }
            while (replaceAll.length() < length) {
                replaceAll.append(replceStr);
            }
        }
        return replaceAll.substring(0, length);
    }

    /**
     * Initializes the filter: pre-builds the mask buffer and loads the dictionary resource
     * named by {@code fileName}, one word per line. Empty lines and duplicate words are
     * skipped. An I/O or encoding failure while reading is reported on standard error and
     * leaves the dictionary with whatever was read up to that point.
     *
     * @throws IllegalStateException if the dictionary resource is not on the classpath
     */
    public void InitializationWork() {
        int repetitions = Math.max(replceSize, 0);
        replaceAll = new StringBuilder(repetitions);
        if (replceStr != null) {
            for (int x = 0; x < repetitions; x++) {
                replaceAll.append(replceStr);
            }
        }

        // Load the dictionary.
        arrayList = new ArrayList<String>();
        java.io.InputStream stream =
                SensitiveWord.class.getClassLoader().getResourceAsStream(fileName);
        if (stream == null) {
            throw new IllegalStateException(
                    "Sensitive word resource not found on classpath: " + fileName);
        }
        Set<String> seen = new HashSet<String>(); // constant-time duplicate check
        try (java.io.InputStream in = stream;
             InputStreamReader read = new InputStreamReader(in, encoding);
             BufferedReader bufferedReader = new BufferedReader(read)) {
            String txt;
            while ((txt = bufferedReader.readLine()) != null) {
                if (!txt.isEmpty() && seen.add(txt)) {
                    arrayList.add(txt);
                }
            }
        } catch (IOException e) {
            // Best effort: report the problem and keep the words read so far.
            e.printStackTrace();
        }
    }

    /** @return the cached mask buffer, or {@code null} if it has not been built yet */
    public StringBuilder getReplaceAll() {
        return replaceAll;
    }

    /** @param replaceAll buffer from which masks are taken as prefixes */
    public void setReplaceAll(StringBuilder replaceAll) {
        this.replaceAll = replaceAll;
    }

    /** @return the string used to build the mask */
    public String getReplceStr() {
        return replceStr;
    }

    /** @param replceStr string used to build the mask */
    public void setReplceStr(String replceStr) {
        this.replceStr = replceStr;
    }

    /** @return number of repetitions of the replacement string that are pre-built */
    public int getReplceSize() {
        return replceSize;
    }

    /** @param replceSize number of repetitions of the replacement string to pre-build */
    public void setReplceSize(int replceSize) {
        this.replceSize = replceSize;
    }

    /** @return name of the dictionary resource */
    public String getFileName() {
        return fileName;
    }

    /** @param fileName name of the dictionary resource, including its extension */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return the dictionary entries, or {@code null} if none have been loaded or set */
    public List<String> getArrayList() {
        return arrayList;
    }

    /** @param arrayList dictionary entries to use instead of the resource file */
    public void setArrayList(List<String> arrayList) {
        this.arrayList = arrayList;
    }

    /** @return character encoding used to read the dictionary resource */
    public String getEncoding() {
        return encoding;
    }

    /** @param encoding character encoding used to read the dictionary resource */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * Demo: loads {@code CensorWords.txt}, filters a sample sentence and prints the elapsed
     * time together with the filtered text.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        long startedAt = System.currentTimeMillis();
        SensitiveWord sw = new SensitiveWord("CensorWords.txt");
        sw.InitializationWork();
        // String to be checked (kept innocuous so that the hosting web page does not block it).
        String str = "的晚上,关上电话静静的发呆着。";
        System.out.println("被检测字符串长度:" + str.length());
        str = sw.filterInfo(str);
        long finishedAt = System.currentTimeMillis();
        System.out.println("总共耗时:" + (finishedAt - startedAt) + "ms");
        System.out.println("替换后的字符串为:\n" + str);
    }

}
CensorWords.txt 用于存放要屏蔽的关键字,每行一个。
// 将 CensorWords.txt 放到 resources 目录下即可。关键字文件未能通过审核,无法在此提供,请自行准备。

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值