敏感词汇过滤

/// <summary>
/// Accepts a posted message, masks any sensitive words it contains and returns the
/// filtered text as a JSON result.
/// </summary>
/// <param name="content">Message text submitted by the client; passed to the filter as-is.</param>
/// <returns>A JSON result whose payload is the filtered text.</returns>
[HttpPost]
public ActionResult SendMsg(string content)
{
    // SensitiveTextFilter takes exactly one argument; the previous call had a
    // dangling comma after it, which is a compile error.
    content = SensitiveWordsHelper.SensitiveTextFilter(content);
    return Json(content);
}
/// <summary>
/// Sensitive-word filter. Loads a word list from an Excel workbook on first use and
/// replaces every occurrence found in user text with asterisks, leaving words that sit
/// inside an HTML tag (between '&lt;' and '&gt;') untouched.
/// </summary>
public class SensitiveWordsHelper
    {
        // Used to record which sensitive words were found in user-submitted content.
        protected static readonly Logger _logger = LogManager.GetCurrentClassLogger();

        // Built once instead of on every call; only used to decide whether the text
        // contains any HTML tag at all, so the per-match tag check can be skipped otherwise.
        private static readonly Regex _htmlTagRegex = new Regex("<[^>]*>");

        // Serialises the one-time load so concurrent first callers do not each read the workbook.
        private static readonly object _loadLock = new object();

        // Word list keyed by first character. Each bucket is sorted by ascending word length.
        // Only ever assigned a fully built dictionary, so readers never observe a partial one.
        private static volatile Dictionary<char, List<string>> _sensitiveWordsDictionary;

        /// <summary>
        /// Returns the sensitive-word dictionary, loading it on first use.
        /// </summary>
        /// <returns>Dictionary mapping a word's first character to the words starting with it.</returns>
        private static Dictionary<char, List<string>> GetSensitiveWordsDictionary()
        {
            var cached = _sensitiveWordsDictionary;
            if (cached != null)
            {
                return cached;
            }

            lock (_loadLock)
            {
                if (_sensitiveWordsDictionary == null)
                {
                    // Build completely before publishing to the shared field.
                    _sensitiveWordsDictionary = BuildDictionary(LoadSensitiveWordsFromExcel());
                }

                return _sensitiveWordsDictionary;
            }
        }

        /// <summary>
        /// Reads the first column of every row of the "敏感词" sheet in ~/Data/SensitiveWord.xlsx.
        /// </summary>
        /// <returns>All non-blank cell values, in sheet order.</returns>
        private static List<string> LoadSensitiveWordsFromExcel()
        {
            // Pre-sized; presumably matches the approximate number of words in the workbook.
            var words = new List<string>(17200);

            // NOTE(review): HttpContext.Current is dereferenced directly, so this must run
            // on a thread that has a current HTTP context.
            var path = HttpContext.Current.Server.MapPath("~/Data/SensitiveWord.xlsx");
            var workbook = new XSSFWorkbook(path);
            try
            {
                var sheet = workbook.GetSheet("敏感词");
                var firstRow = sheet.FirstRowNum;
                var lastRow = sheet.LastRowNum;

                for (var i = firstRow; i <= lastRow; i++)
                {
                    var row = sheet.GetRow(i);
                    if (row == null)
                    {
                        continue;
                    }

                    // A row may exist without a cell in column 0; skip it rather than
                    // dereferencing null.
                    var cell = row.GetCell(0);
                    if (cell == null)
                    {
                        continue;
                    }

                    string name;
                    try
                    {
                        name = cell.StringCellValue;
                    }
                    catch
                    {
                        // Falls back to the numeric value when the cell cannot be read as text.
                        name = cell.NumericCellValue.ToString();
                    }

                    if (!string.IsNullOrWhiteSpace(name))
                    {
                        words.Add(name);
                    }
                }
            }
            finally
            {
                // Release the workbook even when reading a row throws.
                workbook.Close();
            }

            return words;
        }

        /// <summary>
        /// Groups words by their first character and sorts each group by ascending length.
        /// </summary>
        /// <param name="words">Raw word list; blank entries are ignored.</param>
        /// <returns>A new dictionary keyed by first character.</returns>
        private static Dictionary<char, List<string>> BuildDictionary(List<string> words)
        {
            var grouped = new Dictionary<char, List<string>>();
            foreach (var word in words)
            {
                if (string.IsNullOrWhiteSpace(word))
                {
                    continue;
                }

                List<string> bucket;
                if (!grouped.TryGetValue(word[0], out bucket))
                {
                    bucket = new List<string>();
                    grouped.Add(word[0], bucket);
                }

                bucket.Add(word);
            }

            // Sort once here (shortest first) instead of on every character of every filtered text.
            var sorted = new Dictionary<char, List<string>>(grouped.Count);
            foreach (var pair in grouped)
            {
                sorted.Add(pair.Key, pair.Value.OrderBy(w => w.Length).ToList());
            }

            return sorted;
        }

        /// <summary>
        /// Masks sensitive words in the given text.
        /// </summary>
        /// <param name="text">Text to filter; may contain HTML markup.</param>
        /// <returns>
        /// The text with each matched word replaced by the same number of '*' characters,
        /// or null when <paramref name="text"/> is null.
        /// </returns>
        public static string SensitiveTextFilter(string text)
        {
            if (text == null)
                return null;

            var hasHtmlTag = _htmlTagRegex.IsMatch(text);
            var sensitiveWordDict = GetSensitiveWordsDictionary();

            var sb = new StringBuilder(text.Length);
            var filterLogSb = new StringBuilder();
            int textLength = text.Length;

            for (int i = 0; i < textLength; i++)
            {
                char tChar = text[i];
                string matchedWord = null;

                List<string> candidates;
                if (sensitiveWordDict.TryGetValue(tChar, out candidates))
                {
                    foreach (var wordItem in candidates)
                    {
                        var wordItemLength = wordItem.Length;
                        if (i + wordItemLength > textLength)
                        {
                            // Candidates are sorted shortest first, so none of the remaining ones fit either.
                            break;
                        }

                        // Ordinal comparison of the slice starting at i, without allocating a substring.
                        if (string.CompareOrdinal(text, i, wordItem, 0, wordItemLength) != 0)
                        {
                            continue;
                        }

                        // A word that sits inside an HTML tag is left alone; try the next candidate.
                        if (hasHtmlTag && IsTextInsideAHtmlTag(text, wordItem, i))
                        {
                            continue;
                        }

                        matchedWord = wordItem;
                        break;
                    }
                }

                if (matchedWord == null)
                {
                    sb.Append(tChar);
                    continue;
                }

                sb.Append(GetStarwordString(matchedWord.Length));

                filterLogSb.Append(" ");
                filterLogSb.Append(matchedWord);

                // Skip past the masked word; the loop's i++ moves to the character after it.
                i = i + matchedWord.Length - 1;
            }

            if (filterLogSb.Length > 0)
            {
                var filterWords = filterLogSb.ToString();
                _logger.Info($"敏感词过滤:{filterWords}, 上下文:{text}");
            }

            return sb.ToString();
        }

        /// <summary>
        /// Determines whether a matched word lies inside an HTML tag, i.e. the nearest
        /// angle bracket to its left is '&lt;' and the nearest one to its right is '&gt;'.
        /// </summary>
        /// <param name="text">The whole rich text.</param>
        /// <param name="sensitiveWord">The matched sensitive word.</param>
        /// <param name="i">Index of the matched word within <paramref name="text"/>.</param>
        /// <returns>true when the word is enclosed by '&lt;' ... '&gt;'; otherwise false.</returns>
        private static bool IsTextInsideAHtmlTag(string text, string sensitiveWord, int i)
        {
            // Scan left: hitting '>' first means a tag was already closed before the word.
            bool foundOpeningBracket = false;
            for (int j = i - 1; j >= 0; j--)
            {
                char current = text[j];
                if (current == '>')
                {
                    return false;
                }

                if (current == '<')
                {
                    foundOpeningBracket = true;
                    break;
                }
            }

            if (!foundOpeningBracket)
            {
                return false;
            }

            // Scan right: hitting '<' first means a new tag starts before any closing '>'.
            for (int j = i + sensitiveWord.Length; j < text.Length; j++)
            {
                char current = text[j];
                if (current == '<')
                {
                    return false;
                }

                if (current == '>')
                {
                    return true;
                }
            }

            // Reached the end of the text without a closing '>'.
            return false;
        }

        /// <summary>
        /// Builds the mask string used to replace a sensitive word.
        /// </summary>
        /// <param name="length">Number of '*' characters to produce.</param>
        /// <returns>A string of <paramref name="length"/> asterisks; empty when length is not positive.</returns>
        private static string GetStarwordString(int length)
        {
            return length <= 0 ? string.Empty : new string('*', length);
        }
    }
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值