StringPointer.java过滤字符集对象
package com.flaginfo.wcard.sensitive;
import java.io.Serializable;
/**
 * A window ({@code offset}, {@code length}) over a {@code char[]} that behaves like a
 * small, mutable string.
 *
 * <p>Views created by {@link #substring(int)} / {@link #substring(int, int)} share the
 * backing array with their parent, so {@link #fill(int, int, char)} on one view is
 * visible through every other view of the same array.
 *
 * <p>Ordering ({@link #compareTo}) and equality ({@link #equals}) are both defined purely
 * by character content, and {@link #hashCode()} uses the same polynomial as
 * {@link String#hashCode()}.
 *
 * @author Meng.Liu
 * @since 2018/2/2
 */
public class StringPointer implements Serializable, CharSequence, Comparable<StringPointer> {

    private static final long serialVersionUID = 1L;

    /** Backing characters; possibly shared with other views. */
    private final char[] value;

    /** Index in {@link #value} of this view's first character. */
    private final int offset;

    /** Number of characters in this view. */
    private final int length;

    /** Cached hash code; 0 means "not computed yet" (or invalidated by {@link #fill}). */
    private int hash = 0;

    /**
     * Creates a view over a private copy of the characters of {@code str}.
     *
     * @param str source string; must not be {@code null}
     */
    public StringPointer(String str) {
        value = str.toCharArray();
        offset = 0;
        length = value.length;
    }

    /**
     * Creates a view over the given array without copying it.
     *
     * @param value  backing array (shared, not copied)
     * @param offset index of the first character of the view
     * @param length number of characters in the view
     */
    public StringPointer(char[] value, int offset, int length) {
        this.value = value;
        this.offset = offset;
        this.length = length;
    }

    /**
     * Computes a hash of the two characters starting at position {@code i} (inclusive).
     *
     * @param i position in this view, from 0 to {@code length - 2}
     * @return {@code 31 * first + second}
     */
    public int nextTwoCharHash(int i) {
        return 31 * value[offset + i] + value[offset + i + 1];
    }

    /**
     * Packs the two characters starting at position {@code i} (inclusive) into one int:
     * the first character in the high 16 bits, the second in the low 16 bits. Two equal
     * results therefore mean the same two characters.
     *
     * @param i position in this view, from 0 to {@code length - 2}
     * @return the packed value
     */
    public int nextTwoCharMix(int i) {
        return (value[offset + i] << 16) | value[offset + i + 1];
    }

    /**
     * Tells whether the text starting at position {@code i} (inclusive) begins with
     * {@code word}.
     *
     * @param i    position in this view
     * @param word the word to look for
     * @return {@code true} if {@code word} fits in the remaining text and matches it
     */
    public boolean nextStartsWith(int i, StringPointer word) {
        if (word.length > length - i) {
            return false;
        }
        // Compared back to front, exactly as many characters as the word has.
        for (int c = word.length - 1; c >= 0; c--) {
            if (value[offset + i + c] != word.value[word.offset + c]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Overwrites a range of this view with a single character.
     *
     * <p>The cached hash code of this instance is reset so that {@link #hashCode()} stays
     * consistent with {@link #equals(Object)} after the content changes. Other views that
     * share the backing array keep their own cache and are not reset by this call.
     *
     * @param begin    first position to overwrite (inclusive)
     * @param end      position to stop at (exclusive)
     * @param fillWith replacement character
     */
    public void fill(int begin, int end, char fillWith) {
        for (int i = begin; i < end; i++) {
            value[offset + i] = fillWith;
        }
        // Content changed: the previously cached hash no longer describes it.
        hash = 0;
    }

    /** {@inheritDoc} */
    @Override
    public int length() {
        return length;
    }

    /**
     * Returns the character at position {@code i} of this view.
     *
     * @param i position, from 0 to {@code length() - 1}
     * @return the character at that position
     * @throws IndexOutOfBoundsException if {@code i} is outside this view, as required by
     *                                   {@link CharSequence#charAt(int)}
     */
    @Override
    public char charAt(int i) {
        // Without this check a sub-view would silently read its parent's characters.
        if (i < 0 || i >= length) {
            throw new StringIndexOutOfBoundsException(i);
        }
        return value[offset + i];
    }

    /**
     * Returns a view from {@code begin} to the end of this view, sharing the backing array.
     *
     * @param begin first position (inclusive)
     * @return the sub-view
     */
    public StringPointer substring(int begin) {
        return new StringPointer(value, offset + begin, length - begin);
    }

    /**
     * Returns a view of the range {@code [begin, end)}, sharing the backing array.
     *
     * @param begin first position (inclusive)
     * @param end   last position (exclusive)
     * @return the sub-view
     */
    public StringPointer substring(int begin, int end) {
        return new StringPointer(value, offset + begin, end - begin);
    }

    /** Same as {@link #substring(int, int)}. */
    @Override
    public CharSequence subSequence(int start, int end) {
        return substring(start, end);
    }

    /** Returns a new {@link String} holding a copy of this view's characters. */
    @Override
    public String toString() {
        return new String(value, offset, length);
    }

    /**
     * Returns the content hash, computed lazily and cached.
     *
     * @return {@code s[0]*31^(n-1) + ... + s[n-1]} over this view's characters
     */
    @Override
    public int hashCode() {
        int h = hash;
        if (h == 0 && length > 0) {
            for (int i = 0; i < length; i++) {
                h = 31 * h + value[offset + i];
            }
            hash = h;
        }
        return h;
    }

    /**
     * Two pointers are equal when they have the same length and the same characters,
     * regardless of which array or offset they refer to.
     */
    @Override
    public boolean equals(Object anObject) {
        if (this == anObject) {
            return true;
        }
        if (!(anObject instanceof StringPointer)) {
            return false;
        }
        StringPointer that = (StringPointer) anObject;
        if (length != that.length) {
            return false;
        }
        char[] v1 = this.value;
        char[] v2 = that.value;
        for (int i = 0; i < length; i++) {
            if (v1[this.offset + i] != v2[that.offset + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Compares by character content: the first differing character decides; if one is a
     * prefix of the other, the shorter one sorts first.
     */
    @Override
    public int compareTo(StringPointer that) {
        int len1 = this.length;
        int len2 = that.length;
        int lim = Math.min(len1, len2);
        char[] v1 = this.value;
        char[] v2 = that.value;
        for (int k = 0; k < lim; k++) {
            char c1 = v1[this.offset + k];
            char c2 = v2[that.offset + k];
            if (c1 != c2) {
                // char values are 0..65535, so the difference cannot overflow.
                return c1 - c2;
            }
        }
        return len1 - len2;
    }
}
SensitiveNode.java过滤字符集链表
package com.flaginfo.wcard.sensitive;
import java.io.Serializable;
import java.util.TreeSet;
/**
 * A singly linked node that groups all sensitive words sharing the same first two
 * characters.
 *
 * @author Meng.Liu
 * @since 2018/2/2
 */
public class SensitiveNode implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * The first two characters of the words in this node, packed into one int.
     * Equal values mean the same two characters.
     */
    final int headTwoCharMix;

    /** All words that start with those two characters, in sorted order. */
    final TreeSet<StringPointer> words = new TreeSet<StringPointer>();

    /** The node that follows this one, or {@code null} if this is the last. */
    SensitiveNode next;

    /**
     * Creates a stand-alone node.
     *
     * @param headTwoCharMix packed first two characters for this node
     */
    SensitiveNode(int headTwoCharMix) {
        this.headTwoCharMix = headTwoCharMix;
    }

    /**
     * Creates a node and links it as the successor of {@code parent}, replacing whatever
     * {@code parent.next} pointed to before.
     *
     * @param headTwoCharMix packed first two characters for this node
     * @param parent         the node whose {@code next} will point to the new node
     */
    SensitiveNode(int headTwoCharMix, SensitiveNode parent) {
        this(headTwoCharMix);
        parent.next = this;
    }
}
SensitiveFilter.java过滤字符处理函数
package com.flaginfo.wcard.sensitive;
import com.flaginfo.wcard.dao.WxSensitiveWordMapper;
import com.flaginfo.wcard.domain.WxSensitiveWord;
import com.flaginfo.wcard.domain.common.SysErrorCode;
import com.flaginfo.wcard.util.common.BusinessException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.NavigableSet;
/**
 * Sensitive-word filter backed by a fixed-size table indexed by a hash of the first two
 * characters of each word. Every slot holds a chain of {@link SensitiveNode}s, one per
 * distinct leading character pair, and each node holds the sorted set of words that
 * start with that pair.
 *
 * <p>Obtain the shared instance with {@link #getInstance()}, load words with
 * {@link #initFilterData(WxSensitiveWordMapper)} or {@link #put(String)}, then query with
 * the {@code check...}, {@code replace...} or {@code filterSensitive} methods.
 *
 * <p>NOTE(review): {@link #put(String)} and the scan methods are not synchronized;
 * concurrent loading and querying is presumably not intended — confirm with callers.
 *
 * @author Meng.Liu
 * @since 2018/2/2
 */
public class SensitiveFilter implements Serializable {

    private static final long serialVersionUID = 1L;

    private static final Logger logger = LoggerFactory.getLogger(SensitiveFilter.class);

    /**
     * Table size. To make a miss likely to land on an empty slot, this should be several
     * times the number of words, keeping the table sparse and lookups fast. Slots are
     * selected with {@code hash & (length - 1)}, so the value must be a power of two.
     */
    static final int DEFAULT_INITIAL_CAPACITY = 131072;

    private SensitiveNode[] sensitiveNodes = new SensitiveNode[DEFAULT_INITIAL_CAPACITY];

    /**
     * The shared instance. Declared {@code volatile} so that the unsynchronized first
     * read in {@link #getInstance()} can never observe a partially constructed object.
     */
    private static volatile SensitiveFilter instance = null;

    /**
     * Returns the shared filter, creating it on first use.
     *
     * @return the single {@code SensitiveFilter} instance
     */
    public static SensitiveFilter getInstance() {
        SensitiveFilter current = instance;
        if (current == null) {
            synchronized (SensitiveFilter.class) {
                current = instance;
                if (current == null) {
                    current = new SensitiveFilter();
                    instance = current;
                }
            }
        }
        return current;
    }

    /**
     * Private: instances are only handed out through {@link #getInstance()}.
     */
    private SensitiveFilter() {
    }

    /**
     * Loads every word returned by {@code wxSensitiveWordMapper.selectAll()} into the
     * filter. The mapper is just one possible source; words can also be added one by one
     * through {@link #put(String)}.
     *
     * @param wxSensitiveWordMapper source of the word list
     * @throws Exception a {@code BusinessException} if the mapper is {@code null} or
     *                   returns {@code null}; anything the mapper itself throws
     */
    public void initFilterData(WxSensitiveWordMapper wxSensitiveWordMapper) throws Exception {
        if (null == wxSensitiveWordMapper) {
            logger.error("[Sensitive Filter] : Error, cannot find bean WxSensitiveWordMapper.class");
            throw new BusinessException(SysErrorCode.SYSTEM_ERROR);
        }
        List<WxSensitiveWord> wxSensitiveWords = wxSensitiveWordMapper.selectAll();
        if (null == wxSensitiveWords) {
            throw new BusinessException(SysErrorCode.EMPTY_DATA_ERROR);
        }
        for (WxSensitiveWord sensitiveWord : wxSensitiveWords) {
            put(sensitiveWord.getSensitiveWord());
        }
        logger.info("[Sensitive Filter] : load sensitive word success, [{}] records were loaded.", wxSensitiveWords.size());
    }

    /**
     * Adds one sensitive word. The word is trimmed first and then rejected when
     * <ul>
     *   <li>it is {@code null} or shorter than two characters, or</li>
     *   <li>it is exactly two characters long and both match the regex {@code \w}.</li>
     * </ul>
     * Both checks are applied to the trimmed text, i.e. to exactly what would be stored.
     *
     * @param word the word to add
     * @return {@code true} if the word was accepted, {@code false} if it was rejected
     */
    public boolean put(String word) {
        if (word == null) {
            return false;
        }
        String trimmed = word.trim();
        if (trimmed.length() < 2) {
            return false;
        }
        if (trimmed.length() == 2 && trimmed.matches("\\w\\w")) {
            return false;
        }

        StringPointer sp = new StringPointer(trimmed);
        int mix = sp.nextTwoCharMix(0);
        int index = sp.nextTwoCharHash(0) & (sensitiveNodes.length - 1);

        SensitiveNode node = sensitiveNodes[index];
        if (node == null) {
            node = new SensitiveNode(mix);
            sensitiveNodes[index] = node;
            node.words.add(sp);
            return true;
        }
        // Walk the chain to the node for this character pair, appending one if missing.
        while (node.headTwoCharMix != mix) {
            if (node.next == null) {
                node = new SensitiveNode(mix, node);
                break;
            }
            node = node.next;
        }
        node.words.add(sp);
        return true;
    }

    /**
     * Tells whether the sentence contains any sensitive word.
     *
     * @param sentence the text to check
     * @return {@code true} if at least one word is found, otherwise {@code false}
     */
    public Boolean checkIfIncludeSensitiveWordsInSentence(String sentence) {
        return filterSensitive(sentence);
    }

    /**
     * Collects the sensitive words found in the sentence, in order of appearance. A word
     * that occurs several times is listed once per occurrence.
     *
     * @param sentence the text to check
     * @return the words found; empty if there are none
     */
    public List<String> checkSensitiveWordsInSentence(String sentence) {
        final List<String> found = new ArrayList<String>();
        filterSensitive(sentence, new FilterManager() {
            @Override
            public void handle(StringPointer sentenceSP, StringPointer sensitiveSP, Integer position) {
                found.add(sensitiveSP.toString());
            }
        });
        return found;
    }

    /**
     * Replaces every character of every sensitive word in the sentence with
     * {@code character}.
     *
     * @param sentence  the text to process
     * @param character replacement character; when {@code null} nothing is replaced
     * @return the processed text, or {@code sentence} itself if nothing was found
     */
    public String replaceSensitiveWordsInSentence(String sentence, final Character character) {
        return filterSensitive(sentence, new FilterManager() {
            @Override
            public void handle(StringPointer sentenceSP, StringPointer sensitiveSP, Integer position) {
                if (null != character) {
                    sentenceSP.fill(position, position + sensitiveSP.length(), character);
                }
            }
        });
    }

    /**
     * Scans the sentence and stops at the first sensitive word.
     *
     * @param sentence the text to scan; {@code null} is treated as containing nothing
     * @return {@code true} if a sensitive word is found, otherwise {@code false}
     */
    public Boolean filterSensitive(String sentence) {
        if (sentence == null) {
            return false;
        }
        StringPointer sp = new StringPointer(sentence);
        // A match needs at least two characters, so the last start position is length - 2.
        for (int position = 0; position < sp.length() - 1; position++) {
            if (findWordAt(sp, position) != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Scans the sentence and hands every sensitive word found to {@code filterManager}.
     * After a match, scanning resumes right after the matched word.
     *
     * @param sentence      the text to scan; {@code null} is returned unchanged
     * @param filterManager callback invoked for each match; may be {@code null}
     * @return the text of the working copy (including any changes the callback made to
     *         it) if at least one word was found, otherwise {@code sentence} itself
     */
    public String filterSensitive(String sentence, FilterManager filterManager) {
        if (sentence == null) {
            return sentence;
        }
        StringPointer sp = new StringPointer(sentence);
        boolean replaced = false;
        int position = 0;
        while (position < sp.length() - 1) {
            StringPointer word = findWordAt(sp, position);
            if (word == null) {
                position++;
                continue;
            }
            if (null != filterManager) {
                filterManager.handle(sp, word, position);
            }
            replaced = true;
            position += word.length();
        }
        return replaced ? sp.toString() : sentence;
    }

    /**
     * Finds a registered word that the text starts with at {@code position}.
     *
     * @param sp       the text being scanned
     * @param position start position, at most {@code sp.length() - 2}
     * @return the matching word, or {@code null} if none matches here
     */
    private StringPointer findWordAt(StringPointer sp, int position) {
        int hash = sp.nextTwoCharHash(position);
        SensitiveNode node = sensitiveNodes[hash & (sensitiveNodes.length - 1)];
        if (node == null) {
            return null;
        }
        int mix = sp.nextTwoCharMix(position);
        for (; node != null; node = node.next) {
            if (node.headTwoCharMix != mix) {
                continue;
            }
            // A word that is a prefix of the remaining text never sorts after it, so only
            // words <= the remaining text are candidates. Walking them in descending
            // order reaches longer matching prefixes before shorter ones.
            NavigableSet<StringPointer> candidates = node.words.headSet(sp.substring(position), true);
            for (StringPointer word : candidates.descendingSet()) {
                if (sp.nextStartsWith(position, word)) {
                    return word;
                }
            }
        }
        return null;
    }

    /**
     * Callback that decides what to do with each sensitive word found.
     */
    public interface FilterManager {
        /**
         * Handles one detected sensitive word.
         *
         * @param sentenceSP  working copy of the sentence being scanned
         * @param sensitiveSP the sensitive word that was detected
         * @param position    position of the word within the sentence
         */
        void handle(StringPointer sentenceSP, StringPointer sensitiveSP, Integer position);
    }

    /**
     * Ad-hoc check left in the class: prints the result of {@code String.valueOf} for a
     * {@code null} {@code Character}. It does not exercise the filter.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Character a = null;
        System.out.println(String.valueOf(a));
    }
}