敏感字检测或替换,并可以限定字符串长度。

大概测试了一下,效率还算可以。
用到了 log4net 这个包来输出调试日志,不需要的话把相关代码删掉就行。

using log4net;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;

/// <summary>
/// Singleton helper that validates nicknames and detects or masks bad words
/// loaded from a word-list file.
/// </summary>
/// <remarks>
/// Bad words are stored normalized (half-width, lower case) and bucketed by the
/// code unit of their first character. Text being scanned is normalized the same
/// way before matching, so upper-case and full-width spellings are caught too.
/// </remarks>
class StringUtility
{
    private const char HalfWidthSpace = '\u0020';
    private const char FullWidthSpace = '\u3000';

    // Characters up to this code unit (U+024F) count as one byte in CheckName;
    // everything above counts as two.
    private const int SingleByteMaxChar = 591;

    private readonly ILog m_loger;

    // Bad words grouped by the UTF-16 code unit of their first character.
    // Starts empty so that checks are safe before (or after a failed) InitBadWords.
    private Dictionary<int, HashSet<string>> m_badWords = new Dictionary<int, HashSet<string>>();

    private static readonly object locker = new object();
    private static volatile StringUtility instance;

    /// <summary>
    /// Returns the shared instance, creating it on first use.
    /// </summary>
    public static StringUtility Instance()
    {
        if (instance == null)
        {
            lock (locker)
            {
                // Re-check under the lock: another thread may have created the
                // instance while this one was waiting.
                if (instance == null)
                {
                    instance = new StringUtility();
                }
            }
        }
        return instance;
    }

    private StringUtility()
    {
        m_loger = LogManager.GetLogger("");
    }

    /// <summary>
    /// Loads the bad-word list from a text file, one word per line.
    /// </summary>
    /// <param name="badWordsPath">Path of the word list; it is read with <c>Encoding.Unicode</c> (UTF-16).</param>
    /// <remarks>
    /// Each non-blank line is trimmed, normalized with <see cref="ToDBC"/> and stored together
    /// with its Traditional Chinese variant. Any exception is logged and swallowed; in that
    /// case the previously loaded list (initially empty) stays in effect.
    /// </remarks>
    public void InitBadWords(string badWordsPath)
    {
        try
        {
            string[] lines = System.IO.File.ReadAllLines(badWordsPath, System.Text.Encoding.Unicode);
            Dictionary<int, HashSet<string>> table = new Dictionary<int, HashSet<string>>();
            for (int i = 0; i < lines.Length; i++)
            {
                // Blank lines would otherwise fail on word[0] and abort the whole load.
                if (string.IsNullOrWhiteSpace(lines[i]))
                {
                    continue;
                }
                string word = ToDBC(lines[i]).Trim();
                AddBadWord(table, word);
                AddBadWord(table, ToTraditional(word));
            }
            // Publish only a fully built table.
            m_badWords = table;
        }
        catch (Exception ex)
        {
            m_loger.Error(ex + "文件读取失败" + badWordsPath);
        }
    }

    /// <summary>
    /// Adds a word to the bucket of its first character; the HashSet removes duplicates.
    /// </summary>
    private static void AddBadWord(Dictionary<int, HashSet<string>> table, string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return;
        }
        int firstChar = word[0];
        HashSet<string> bucket;
        if (!table.TryGetValue(firstChar, out bucket))
        {
            bucket = new HashSet<string>();
            table[firstChar] = bucket;
        }
        bucket.Add(word);
    }

    /// <summary>
    /// Validates a nickname. An empty return value means the name is acceptable;
    /// otherwise the returned text describes why it was rejected.
    /// </summary>
    /// <param name="name">Nickname to validate.</param>
    /// <param name="maxLength">
    /// Maximum allowed length, where characters up to U+024F count as 1 and all others as 2.
    /// </param>
    /// <returns>An empty string when valid, otherwise the rejection message.</returns>
    public string CheckName(string name, int maxLength = 16)
    {
        if (string.IsNullOrEmpty(name) || IsAllBlank(name))
        {
            return "昵称不可以为空!";
        }

        if (GetWeightedLength(name) > maxLength)
        {
            // Report the limit actually in force rather than a hard-coded 16.
            return "长度不能超过" + maxLength + "个字节(一个中文占两个字节)";
        }

        for (int i = 0; i < name.Length; i++)
        {
            if (IsForbiddenNameChar(name[i]))
            {
                return "昵称中不能包含: " + name[i];
            }
        }

        string badWord = CheckOrFilterBadWord(name, false);
        if (!string.IsNullOrEmpty(badWord))
        {
            return "名字中包含敏感字: " + badWord;
        }
        return "";
    }

    /// <summary>
    /// True when the text consists solely of half-width or full-width spaces.
    /// </summary>
    private static bool IsAllBlank(string text)
    {
        for (int i = 0; i < text.Length; i++)
        {
            if (text[i] != HalfWidthSpace && text[i] != FullWidthSpace)
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Length of the text where code units up to <see cref="SingleByteMaxChar"/> weigh 1 and others 2.
    /// </summary>
    private static int GetWeightedLength(string text)
    {
        int length = 0;
        for (int i = 0; i < text.Length; i++)
        {
            length += text[i] <= SingleByteMaxChar ? 1 : 2;
        }
        return length;
    }

    /// <summary>
    /// True for control and non-displayable characters that are not allowed in a nickname:
    /// U+0000-U+001F, U+007F, a handful of C1/Latin-1 code points, and U+2400-U+243F.
    /// </summary>
    private static bool IsForbiddenNameChar(char c)
    {
        return c <= 31
            || c == 127
            || c == '\u0081' || c == '\u008F' || c == '\u0090' || c == '\u009D'
            || c == '\u00A0' || c == '\u00AD'
            || (c >= 9216 && c <= 9279);
    }

    /// <summary>
    /// Detects bad words in a text, or masks them with '*'.
    /// </summary>
    /// <param name="sourceText">Text to scan.</param>
    /// <param name="filter">
    /// <c>true</c>: return the text with every matched span replaced by '*';
    /// <c>false</c>: return the first bad word found, or an empty string if none.
    /// </param>
    /// <returns>The masked text or the detected bad word, depending on <paramref name="filter"/>.</returns>
    public string CheckOrFilterBadWord(string sourceText, bool filter = true)
    {
        if (string.IsNullOrEmpty(sourceText))
        {
            return string.Empty;
        }

        Dictionary<int, HashSet<string>> badWords = m_badWords;
        // Matching runs on a normalized copy of the same length, so indexes found
        // there can be applied directly to the original characters.
        string normalized = ToDBC(sourceText);
        char[] masked = filter ? sourceText.ToCharArray() : null;

        for (int i = 0; i < normalized.Length; i++)
        {
            HashSet<string> candidates;
            if (!badWords.TryGetValue(normalized[i], out candidates))
            {
                continue;
            }
            foreach (string badword in candidates)
            {
                int matchLength = GetBadWordLength(i, badword, normalized);
                if (matchLength <= 0)
                {
                    continue;
                }
                if (!filter)
                {
                    // Detection only: the first hit is enough.
                    return badword;
                }
                // Mask the whole matched span, including any separators inside it.
                // Scanning still continues from i + 1 so overlapping words are caught.
                int end = i + matchLength;
                for (int j = i; j < end; j++)
                {
                    masked[j] = '*';
                }
            }
        }

        return filter ? new string(masked) : string.Empty;
    }

    /// <summary>
    /// Tries to match a bad word at a position and returns the length of the matched span.
    /// </summary>
    /// <param name="startIndex">Index in <paramref name="sourceText"/> where matching starts.</param>
    /// <param name="badword">Bad word to look for.</param>
    /// <param name="sourceText">Text being scanned (already normalized by the caller).</param>
    /// <returns>
    /// Number of characters from <paramref name="startIndex"/> through the last matched
    /// character, or 0 when the word does not match here.
    /// </returns>
    private int GetBadWordLength(int startIndex, string badword, string sourceText)
    {
        if (string.IsNullOrEmpty(badword))
        {
            return 0;
        }

        int badWordIndex = 0;
        for (int i = startIndex; i < sourceText.Length; i++)
        {
            char key = sourceText[i];
            // Compare first, so bad words that themselves contain punctuation can match.
            if (key == badword[badWordIndex])
            {
                badWordIndex++;
                if (badWordIndex == badword.Length)
                {
                    return i - startIndex + 1;
                }
                continue;
            }
            // A mismatching letter, digit or Chinese character ends the attempt;
            // any other character is treated as a separator and skipped.
            if (IsAlphabet(key) || IsCHS(key) || IsNum(key))
            {
                return 0;
            }
        }
        return 0;
    }

    /// <summary>
    /// Converts a string to half-width (DBC case) lower case, character by character.
    /// </summary>
    /// <param name="input">Any string.</param>
    /// <returns>A string of the same length in half-width lower case.</returns>
    /// <remarks>
    /// The full-width space is 12288 and the half-width space is 32.
    /// The other full-width characters (65281-65374) map to half-width (33-126)
    /// by subtracting 65248.
    /// </remarks>
    private string ToDBC(string input)
    {
        char[] chars = input.ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            char c = chars[i];
            if (c == 12288)
            {
                c = (char)32;
            }
            else if (c > 65280 && c < 65375)
            {
                c = (char)(c - 65248);
            }
            // Invariant, per-character lowering keeps the result independent of the
            // current culture and guarantees the length is unchanged.
            chars[i] = char.ToLowerInvariant(c);
        }
        return new string(chars);
    }

    /// <summary>
    /// True when the character is in the CJK ideograph range U+4E00-U+9FA5.
    /// </summary>
    private bool IsCHS(char character)
    {
        return character >= 0x4e00 && character <= 0x9fa5;
    }

    /// <summary>
    /// True when the character is an ASCII digit ('0'-'9').
    /// </summary>
    private bool IsNum(char character)
    {
        return character >= '0' && character <= '9';
    }

    /// <summary>
    /// True when the character is an ASCII letter ('a'-'z' or 'A'-'Z').
    /// </summary>
    private bool IsAlphabet(char character)
    {
        return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
    }


    private const int LOCALE_SYSTEM_DEFAULT = 0x0800;
    private const int LCMAP_SIMPLIFIED_CHINESE = 0x02000000;
    private const int LCMAP_TRADITIONAL_CHINESE = 0x04000000;

    // The destination is a char[] buffer: a managed string is immutable and must not
    // be handed to native code as an output buffer.
    [DllImport("kernel32", CharSet = CharSet.Unicode, SetLastError = true)]
    private static extern int LCMapString(int Locale, int dwMapFlags, string lpSrcStr, int cchSrc, [Out] char[] lpDestStr, int cchDest);

    /// <summary>
    /// Runs LCMapString with the given mapping flag and returns the mapped text.
    /// Returns the source unchanged when it is null/empty or when the call reports failure.
    /// </summary>
    private static string MapChinese(string source, int mapFlag)
    {
        if (string.IsNullOrEmpty(source))
        {
            return source;
        }
        char[] buffer = new char[source.Length];
        int written = LCMapString(LOCALE_SYSTEM_DEFAULT, mapFlag, source, source.Length, buffer, buffer.Length);
        if (written <= 0)
        {
            return source;
        }
        return new string(buffer, 0, written);
    }

    /// <summary>
    /// Converts the text to Simplified Chinese.
    /// </summary>
    /// <param name="source">Text to convert.</param>
    /// <returns>The converted text.</returns>
    private string ToSimplified(string source)
    {
        return MapChinese(source, LCMAP_SIMPLIFIED_CHINESE);
    }

    /// <summary>
    /// Converts the text to Traditional Chinese.
    /// </summary>
    /// <param name="source">Text to convert.</param>
    /// <returns>The converted text.</returns>
    private string ToTraditional(string source)
    {
        return MapChinese(source, LCMAP_TRADITIONAL_CHINESE);
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值