简单的敏感词过滤

StringPointer.java过滤字符集对象

package com.flaginfo.wcard.sensitive;

import java.io.Serializable;

/**
 * A view over a run of characters inside a {@code char[]}, described by an
 * offset and a length.
 * <p>
 * The view is mutable through {@link #fill(int, int, char)}. Views produced by
 * {@link #substring(int)} / {@link #substring(int, int)} share the same backing
 * array as their parent, so a fill through one view is visible through the other.
 *
 * @Author Meng.Liu
 * @Date 2018/2/2 14:52
 */
public class StringPointer implements Serializable, CharSequence, Comparable<StringPointer> {

    private static final long serialVersionUID = 1L;

    /** Backing characters; may be shared with other views. */
    private final char[] value;

    /** Index in {@link #value} of the first character of this view. */
    private final int offset;

    /** Number of characters in this view. */
    private final int length;

    /** Cached hash code; 0 means "not computed yet" (or the hash really is 0). */
    private int hash = 0;

    /**
     * Creates a view over a private copy of the characters of {@code str}.
     *
     * @param str source string
     */
    public StringPointer(String str) {
        value = str.toCharArray();
        offset = 0;
        length = value.length;
    }

    /**
     * Creates a view over part of an existing array. The array is not copied.
     *
     * @param value  backing array
     * @param offset index of the first character of the view
     * @param length number of characters in the view
     */
    public StringPointer(char[] value, int offset, int length) {
        this.value = value;
        this.offset = offset;
        this.length = length;
    }

    /**
     * Computes a hash of the two characters starting at position {@code i} (inclusive).
     *
     * @param i position, from 0 to {@code length - 2}
     * @return {@code 31 * first + second}
     */
    public int nextTwoCharHash(int i) {
        return 31 * value[offset + i] + value[offset + i + 1];
    }

    /**
     * Packs the two characters starting at position {@code i} (inclusive) into one int:
     * the first character in the high 16 bits, the second in the low 16 bits.
     * Two equal results therefore mean the same pair of characters.
     *
     * @param i position, from 0 to {@code length - 2}
     * @return the packed value
     */
    public int nextTwoCharMix(int i) {
        return (value[offset + i] << 16) | value[offset + i + 1];
    }

    /**
     * Tells whether the text starting at position {@code i} (inclusive) begins with {@code word}.
     *
     * @param i    position, from 0 to {@code length - 2}
     * @param word the word to look for
     * @return {@code true} if the characters from {@code i} onwards start with {@code word}
     */
    public boolean nextStartsWith(int i, StringPointer word) {
        if (word.length > length - i) {
            return false;
        }
        // Compare back to front, as the original implementation does.
        for (int c = word.length - 1; c >= 0; c--) {
            if (value[offset + i + c] != word.value[word.offset + c]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Overwrites a range of this view with a single character.
     *
     * @param begin    first position to overwrite (inclusive)
     * @param end      position to stop at (exclusive)
     * @param fillWith replacement character
     */
    public void fill(int begin, int end, char fillWith) {
        for (int i = begin; i < end; i++) {
            value[offset + i] = fillWith;
        }
        // The content changed, so a previously cached hash code would no longer
        // agree with equals(); force recomputation on the next hashCode() call.
        hash = 0;
    }

    @Override
    public int length() {
        return length;
    }

    /**
     * Returns the character at position {@code i} of this view.
     *
     * @param i position, from 0 to {@code length - 1}
     * @return the character at that position
     * @throws StringIndexOutOfBoundsException if {@code i} is outside the view
     */
    @Override
    public char charAt(int i) {
        // Check against the view, not the backing array: the array may extend
        // beyond this view when it is shared with a parent.
        if (i < 0 || i >= length) {
            throw new StringIndexOutOfBoundsException(i);
        }
        return value[offset + i];
    }

    /**
     * Returns a view of this view from {@code begin} to the end, sharing the backing array.
     *
     * @param begin first position (inclusive)
     * @return the sub-view
     * @throws StringIndexOutOfBoundsException if {@code begin} is outside {@code [0, length]}
     */
    public StringPointer substring(int begin) {
        if (begin < 0 || begin > length) {
            throw new StringIndexOutOfBoundsException(begin);
        }
        return new StringPointer(value, offset + begin, length - begin);
    }

    /**
     * Returns a view of this view from {@code begin} to {@code end}, sharing the backing array.
     *
     * @param begin first position (inclusive)
     * @param end   end position (exclusive)
     * @return the sub-view
     * @throws StringIndexOutOfBoundsException unless {@code 0 <= begin <= end <= length}
     */
    public StringPointer substring(int begin, int end) {
        if (begin < 0) {
            throw new StringIndexOutOfBoundsException(begin);
        }
        if (end > length) {
            throw new StringIndexOutOfBoundsException(end);
        }
        if (begin > end) {
            throw new StringIndexOutOfBoundsException(end - begin);
        }
        return new StringPointer(value, offset + begin, end - begin);
    }

    @Override
    public CharSequence subSequence(int start, int end) {
        return substring(start, end);
    }

    /** Returns a new {@code String} holding exactly the characters of this view. */
    @Override
    public String toString() {
        return new String(value, offset, length);
    }

    /**
     * Computes the hash with the same {@code 31 * h + c} recurrence over the view's
     * characters and caches it until the next {@link #fill(int, int, char)}.
     */
    @Override
    public int hashCode() {
        int h = hash;
        if (h == 0 && length > 0) {
            for (int i = 0; i < length; i++) {
                h = 31 * h + value[offset + i];
            }
            hash = h;
        }
        return h;
    }

    /** Two views are equal when they have the same length and the same characters. */
    @Override
    public boolean equals(Object anObject) {
        if (this == anObject) {
            return true;
        }
        if (!(anObject instanceof StringPointer)) {
            return false;
        }
        StringPointer that = (StringPointer) anObject;
        if (length != that.length) {
            return false;
        }
        char[] v1 = this.value;
        char[] v2 = that.value;
        for (int i = 0; i < this.length; i++) {
            if (v1[this.offset + i] != v2[that.offset + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Orders views character by character; when one is a prefix of the other the
     * shorter one sorts first.
     */
    @Override
    public int compareTo(StringPointer that) {
        int len1 = this.length;
        int len2 = that.length;
        int lim = Math.min(len1, len2);
        char[] v1 = this.value;
        char[] v2 = that.value;
        for (int k = 0; k < lim; k++) {
            char c1 = v1[this.offset + k];
            char c2 = v2[that.offset + k];
            if (c1 != c2) {
                // chars are 16-bit, so the int difference cannot overflow
                return c1 - c2;
            }
        }
        return len1 - len2;
    }

}

SensitiveNode.java过滤字符集链表

package com.flaginfo.wcard.sensitive;

import java.io.Serializable;
import java.util.TreeSet;

/**
 * A node of a singly linked chain that groups sensitive words by their first
 * two characters.
 *
 * @Author Meng.Liu
 * @Date 2018/2/2 14:51
 */
public class SensitiveNode implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Packed form ("mix") of the leading two characters; equal mixes mean the
     * same two characters.
     */
    final int headTwoCharMix;

    /**
     * Every word that starts with those two characters, kept sorted.
     */
    final TreeSet<StringPointer> words = new TreeSet<StringPointer>();

    /**
     * The node that follows this one in the chain, or {@code null} at the tail.
     */
    SensitiveNode next;

    /**
     * Creates a stand-alone node.
     *
     * @param headTwoCharMix packed leading two characters for this node
     */
    SensitiveNode(int headTwoCharMix) {
        this.headTwoCharMix = headTwoCharMix;
    }

    /**
     * Creates a node and links it as the successor of {@code parent}.
     *
     * @param headTwoCharMix packed leading two characters for this node
     * @param parent         node whose {@code next} is set to the new node
     */
    SensitiveNode(int headTwoCharMix, SensitiveNode parent) {
        this(headTwoCharMix);
        parent.next = this;
    }

}

SensitiveFilter.java过滤字符处理函数

package com.flaginfo.wcard.sensitive;

import com.flaginfo.wcard.dao.WxSensitiveWordMapper;
import com.flaginfo.wcard.domain.WxSensitiveWord;
import com.flaginfo.wcard.domain.common.SysErrorCode;
import com.flaginfo.wcard.util.common.BusinessException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.NavigableSet;

/**
 * Singleton sensitive-word filter.
 * <p>
 * Words are stored in a fixed-size bucket table indexed by a hash of their
 * first two characters; each bucket is a chain of {@link SensitiveNode}s, one
 * per distinct leading character pair. Scanning a sentence looks up the bucket
 * for every two-character window and tests the words stored there.
 * <p>
 * NOTE(review): the table itself is not synchronized; {@link #put(String)}
 * running concurrently with filtering is not protected by this class.
 *
 * @Author Meng.Liu
 * @Date 2018/2/2 14:51
 */
public class SensitiveFilter implements Serializable {

    private static final long serialVersionUID = 1L;
    private static final Logger logger = LoggerFactory.getLogger(SensitiveFilter.class);

    /**
     * Number of buckets. Meant to be several times the number of words so that
     * a miss usually hits an empty bucket, which keeps lookups fast. Bucket
     * indexes are computed by masking with {@code length - 1}, so this value
     * has to stay a power of two.
     */
    static final int DEFAULT_INITIAL_CAPACITY = 131072;
    private SensitiveNode[] sensitiveNodes = new SensitiveNode[DEFAULT_INITIAL_CAPACITY];

    /**
     * The singleton. Declared {@code volatile} so that the unsynchronized first
     * check in {@link #getInstance()} cannot observe a partially constructed object.
     */
    private static volatile SensitiveFilter instance = null;

    /**
     * Returns the single shared filter, creating it on first use.
     *
     * @return the shared instance, never {@code null}
     */
    public static SensitiveFilter getInstance() {
        SensitiveFilter current = instance;
        if (current == null) {
            synchronized (SensitiveFilter.class) {
                current = instance;
                if (current == null) {
                    current = new SensitiveFilter();
                    instance = current;
                }
            }
        }
        return current;
    }

    /**
     * Private: this class is a singleton, use {@link #getInstance()}.
     */
    private SensitiveFilter() {
    }

    /**
     * Loads every sensitive word returned by the mapper into this filter.
     * The words come from the database here, but could be loaded from elsewhere.
     *
     * @param wxSensitiveWordMapper source of the words
     * @throws Exception a {@code BusinessException} when the mapper is {@code null}
     *                   or returns {@code null}; anything the mapper itself throws
     */
    public void initFilterData(WxSensitiveWordMapper wxSensitiveWordMapper) throws Exception {
        if (null == wxSensitiveWordMapper) {
            logger.error("[Sensitive Filter] : Error, cannot find bean WxSensitiveWordMapper.class");
            throw new BusinessException(SysErrorCode.SYSTEM_ERROR);
        }
        List<WxSensitiveWord> wxSensitiveWords = wxSensitiveWordMapper.selectAll();
        if (null == wxSensitiveWords) {
            throw new BusinessException(SysErrorCode.EMPTY_DATA_ERROR);
        }
        for (WxSensitiveWord sensitiveWord : wxSensitiveWords) {
            // A null row must not abort the whole load.
            if (sensitiveWord != null) {
                put(sensitiveWord.getSensitiveWord());
            }
        }
        logger.info("[Sensitive Filter] : load sensitive word success, [{}] records were loaded.", wxSensitiveWords.size());
    }

    /**
     * Adds one sensitive word. The word is trimmed first; it is rejected when the
     * trimmed form is shorter than two characters, or is exactly two word
     * characters (regex {@code \w\w}).
     *
     * @param word the word to add, may be {@code null}
     * @return {@code false} if the word was rejected, {@code true} otherwise
     */
    public boolean put(String word) {
        if (word == null) {
            return false;
        }
        // All checks run on the trimmed form, because that is what gets stored.
        String trimmed = word.trim();
        if (trimmed.length() < 2) {
            return false;
        }
        if (trimmed.length() == 2 && trimmed.matches("\\w\\w")) {
            return false;
        }
        StringPointer sp = new StringPointer(trimmed);
        int mix = sp.nextTwoCharMix(0);
        int index = sp.nextTwoCharHash(0) & (sensitiveNodes.length - 1);

        SensitiveNode node = sensitiveNodes[index];
        if (node == null) {
            node = new SensitiveNode(mix);
            node.words.add(sp);
            sensitiveNodes[index] = node;
            return true;
        }
        // Walk the chain: reuse the node for this character pair, or append a new one at the tail.
        while (true) {
            if (node.headTwoCharMix == mix) {
                node.words.add(sp);
                return true;
            }
            if (node.next == null) {
                new SensitiveNode(mix, node).words.add(sp);
                return true;
            }
            node = node.next;
        }
    }

    /**
     * Tells whether the sentence contains any sensitive word.
     *
     * @param sentence the sentence to check
     * @return {@code true} if it contains one, {@code false} otherwise
     */
    public Boolean checkIfIncludeSensitiveWordsInSentence(String sentence) {
        return filterSensitive(sentence);
    }

    /**
     * Collects the sensitive words found in the sentence, in order of appearance.
     *
     * @param sentence the sentence to check
     * @return the matched words; empty if none
     */
    public List<String> checkSensitiveWordsInSentence(String sentence) {
        final List<String> list = new ArrayList<String>();
        filterSensitive(sentence, new FilterManager() {
            @Override
            public void handle(StringPointer sentenceSP, StringPointer sensitiveSP, Integer position) {
                list.add(sensitiveSP.toString());
            }
        });
        return list;
    }

    /**
     * Replaces every character of each sensitive word in the sentence with the
     * given character and returns the result.
     *
     * @param sentence  the sentence to process
     * @param character replacement character; when {@code null} nothing is replaced
     * @return the processed sentence
     */
    public String replaceSensitiveWordsInSentence(String sentence, final Character character) {
        return filterSensitive(sentence, new FilterManager() {
            @Override
            public void handle(StringPointer sentenceSP, StringPointer sensitiveSP, Integer position) {
                if (null != character) {
                    sentenceSP.fill(position, position + sensitiveSP.length(), character);
                }
            }
        });
    }

    /**
     * Scans the sentence and stops at the first sensitive word.
     *
     * @param sentence the sentence to scan; {@code null} is treated as containing nothing
     * @return {@code true} if a sensitive word was found
     */
    public Boolean filterSensitive(String sentence) {
        if (sentence == null) {
            return false;
        }
        StringPointer sp = new StringPointer(sentence);
        for (int position = 0; position < sp.length() - 1; position++) {
            // A containment check does not need the full scan: one hit is enough.
            if (findMatch(sp, position) != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Scans the sentence and hands every sensitive word found to {@code filterManager}.
     * After a match the scan resumes right after the matched word.
     *
     * @param sentence      the sentence to scan
     * @param filterManager what to do with each match; may be {@code null}
     * @return {@code sentence} itself when it is {@code null} or nothing matched;
     *         otherwise a string built from the working copy that the handler
     *         may have modified
     */
    public String filterSensitive(String sentence, FilterManager filterManager) {
        if (sentence == null) {
            return null;
        }
        StringPointer sp = new StringPointer(sentence);
        boolean replaced = false;
        int position = 0;
        while (position < sp.length() - 1) {
            StringPointer word = findMatch(sp, position);
            if (word == null) {
                position++;
                continue;
            }
            if (null != filterManager) {
                filterManager.handle(sp, word, position);
            }
            replaced = true;
            position += word.length();
        }
        return replaced ? sp.toString() : sentence;
    }

    /**
     * Looks for a stored word that the sentence starts with at {@code position}.
     * Candidates are tried in descending order, so of two stored words where one
     * is a prefix of the other, the longer one is tried first.
     *
     * @param sp       the sentence
     * @param position start position; at least two characters must remain
     * @return the matching word, or {@code null} if there is none
     */
    private StringPointer findMatch(StringPointer sp, int position) {
        int index = sp.nextTwoCharHash(position) & (sensitiveNodes.length - 1);
        SensitiveNode node = sensitiveNodes[index];
        if (node == null) {
            return null;
        }
        int mix = sp.nextTwoCharMix(position);
        for (; node != null; node = node.next) {
            if (node.headTwoCharMix != mix) {
                continue;
            }
            // A word that is a prefix of the remaining text never sorts after it,
            // so only words up to (and including) the remaining text can match.
            NavigableSet<StringPointer> candidates = node.words.headSet(sp.substring(position), true);
            for (StringPointer word : candidates.descendingSet()) {
                if (sp.nextStartsWith(position, word)) {
                    return word;
                }
            }
        }
        return null;
    }

    /**
     * Callback invoked for every sensitive word found.
     */
    public interface FilterManager {
        /**
         * Handles one detected sensitive word.
         *
         * @param sentenceSP  the sentence being scanned (a working copy that may be modified)
         * @param sensitiveSP the sensitive word that was found
         * @param position    position of the word in the sentence
         */
        void handle(StringPointer sentenceSP, StringPointer sensitiveSP, Integer position);
    }

    /**
     * Leftover scratch entry point: prints the result of {@code String.valueOf}
     * applied to a {@code null} {@code Character}. Unrelated to filtering.
     */
    public static void main(String[] args) {
        Character a = null;
        System.out.println(String.valueOf(a));
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值