C#敏感词过滤算法实现

敏感词、文字过滤是一个网站必不可少的功能,如何设计一个好的、高效的过滤算法是非常有必要的。

在实现文字过滤的算法中,DFA是比较好的一种实现算法。DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须尽量减少运算,而DFA算法中几乎没有什么计算,有的只是状态的转换。

下面看一下用C#实现的方式:

1、构建敏感词库类

/// <summary>
/// Ensures the in-memory lexicon (<c>_memoryLexicon</c>) is built, loading the word list on first use.
/// </summary>
/// <remarks>
/// The lexicon is an array indexed by a word's first character; each slot holds a
/// <c>WordGroup</c> containing the remainder (everything after the first character) of
/// every word starting with that character. Each word is also added in its
/// Traditional Chinese form when that form differs from the original.
/// The array is published to <c>_memoryLexicon</c> only after it has been fully built,
/// so a failed load is not remembered as an (empty) successful one.
/// </remarks>
/// <returns>
/// <c>true</c> when the lexicon is available (already built, or built by this call);
/// <c>false</c> when the word source returned <c>null</c>.
/// </returns>
private bool LoadDictionary()
        {
            // Already built by an earlier call: nothing to do.
            if (_memoryLexicon != null)
                return true;

            var words = new SensitiveWordBll().GetAllWords();
            if (words == null)
                return false; // _memoryLexicon stays null so the next call retries the load

            var wordList = new List<string>();
            foreach (string word in words)
            {
                // Null/empty entries cannot be indexed by first character; skip them up front.
                if (string.IsNullOrEmpty(word))
                    continue;

                wordList.Add(word);
                var chineseWord = Microsoft.VisualBasic.Strings.StrConv(word,
                    Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
                if (word != chineseWord)
                    wordList.Add(chineseWord);
            }

            // One slot per possible char value: char.MaxValue is the largest index,
            // so the array needs char.MaxValue + 1 elements.
            var lexicon = new WordGroup[char.MaxValue + 1];
            foreach (var word in wordList)
            {
                if (string.IsNullOrEmpty(word))
                    continue;

                var group = lexicon[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    lexicon[word[0]] = group;
                }
                // Store only the tail; the first character is implied by the slot index.
                group.Add(word.Substring(1));
            }

            _memoryLexicon = lexicon;
            return true;
        }

2、构建敏感词检测类

/// <summary>
/// Tries to match <paramref name="blackWord"/> against <c>_sourceText</c>, starting at the
/// position immediately after <c>_cursor</c>.
/// </summary>
/// <remarks>
/// Matching is tolerant: characters for which <c>IsChs</c>, <c>IsNum</c> and <c>IsAlphabet</c>
/// all return false are skipped, and after a mismatch up to 4 consecutive source characters
/// are skipped before giving up. As a side effect the method resets and then updates
/// <c>_wordlenght</c> (count of source characters consumed after <c>_cursor</c>) and
/// <c>_nextCursor</c>.
/// </remarks>
/// <param name="blackWord">
/// Characters to match in order. Presumably the tail of a lexicon word (everything after
/// its first character) — confirm against the caller.
/// </param>
/// <returns>
/// <c>true</c> only when the loop runs to completion with the last character matched;
/// every early exit sets the result to <c>false</c>. An empty
/// <paramref name="blackWord"/> yields <c>false</c>.
/// </returns>
private bool Check(string blackWord)
        {
            _wordlenght = 0;
            // Cursor of the next source position to examine
            _nextCursor = _cursor + 1;
            var found = false;
            // Number of consecutive mismatched source characters skipped so far
            var continueCheck = 0;
            // Walk every character of the word and try to match it
            for (var i = 0; i < blackWord.Length; i++)
            {
                // Number of special characters skipped before the compared position
                var offset = 0;
                if (_nextCursor >= _sourceText.Length)
                {
                    // Source exhausted before the word was fully matched.
                    // Inside the loop i < blackWord.Length, so this condition always holds.
                    if (i - 1 < blackWord.Length - 1)
                        found = false;
                    break;
                }
                else
                {
                    // While the next source character is not Chinese, a digit or a letter, increase the offset by 1
                    for (var y = _nextCursor; y < _sourceText.Length; y++)
                    {
                        if (!IsChs(_sourceText[y]) && !IsNum(_sourceText[y]) && !IsAlphabet(_sourceText[y]))
                        {
                            offset++;
                            // Skipping special characters: stop once the position reaches the end of the source
                            if (_nextCursor + offset >= _sourceText.Length)
                                break;
                            // The skipped character counts toward the matched span
                            _wordlenght++;
                        }
                        else break;
                    }
                    if (_nextCursor + offset >= _sourceText.Length)
                    {
                        // Only special characters remained; nothing left to compare
                        found = false;
                        break;
                    }
                    if (blackWord[i] == _sourceText[_nextCursor + offset])
                    {
                        found = true;
                        // A match resets the mismatch budget
                        continueCheck = 0;
                    }
                    else
                    {
                        // On a mismatch, keep trying for up to 4 more source characters
                        if (continueCheck < 4 && _nextCursor < _sourceText.Length - 1)
                        {
                            continueCheck++;
                            // Retry the same word character against the next source position
                            i--;
                        }
                        else
                        {
                            found = false;
                            break;
                        }
                    }
                }
                // Advance past the compared character (matched or skipped) and any special characters before it
                _nextCursor = _nextCursor + 1 + offset;
                _wordlenght++;
            }
            return found;
        }
    }

3、测试与使用方法

// Body of the filtering routine (the method header is not shown in this snippet).
// Scans the source text, replaces every detected word with ReplaceChar, records the
// replaced text in _illegalWords, and returns the masked string.
_illegalWords = new List<string>();
            // Nothing to filter: neither the argument nor the stored text has content
            if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText))
            {
                return sourceText;
            }

            // A non-empty argument replaces the stored source text
            if (!string.IsNullOrEmpty(sourceText))
                _sourceText = sourceText;
            _cursor = 0;
            // Without a lexicon the text is returned unfiltered
            if (!LoadDictionary())
            {
                return _sourceText;
            }

            // Mutable copy of the source that receives the replacement characters
            var tempString = _sourceText.ToCharArray();
            // First-character lookups use the ToDBC-converted text
            // (presumably full-width to half-width conversion — confirm against ToDBC)
            var sourceTextDbc = ToDBC(SourceText);
            for (var i = 0; i < SourceText.Length; i++)
            {
                // Look up the group of words that start with this character
                var group = _memoryLexicon[sourceTextDbc[i]];
                if (group != null)
                {
                    for (var z = 0; z < group.Count(); z++)
                    {
                        string word = group.GetWord(z);
                        // An empty entry means the first character alone is a hit; otherwise match the rest.
                        // NOTE(review): when word.Length == 0, Check is not called, so _wordlenght keeps
                        // whatever value a previous Check call left behind — confirm this is intended.
                        if (word.Length == 0 || Check(word))
                        {
                            // In this mode the first hit ends the scan and null is returned
                            if (isFirstCheckedReturn)
                            {
                                return null;
                            }

                            // Collect the matched span (_wordlenght + 1 characters from _cursor) and mask it
                            var blackword = string.Empty;
                            for (var pos = 0; pos < _wordlenght + 1; pos++)
                            {
                                blackword += tempString[pos + _cursor].ToString();
                                tempString[pos + _cursor] = ReplaceChar;
                            }
                            _illegalWords.Add(blackword);

                            // Jump both positions past the masked span; the increments below add the final +1
                            _cursor = _cursor + _wordlenght;
                            i = i + _wordlenght;
                            break;
                        }
                    }
                }
                // _cursor advances in step with i
                _cursor++;
            }
            return new string(tempString);
// Usage demo and rough timing comparison: the lexicon-based filter versus a naive
// string.Replace pass over every word in the word-list file.
var filter = new SensitiveWordFilter();
            filter.SourceText = "dddddd";
            var sourceText = filter.SourceText;
            filter.ResetMemoryLexicon();

            // Stopwatch is used for elapsed time: DateTime.Now has coarse resolution and
            // is affected by system clock adjustments.
            var filterTimer = System.Diagnostics.Stopwatch.StartNew();
            var ss = filter.Filter();
            filterTimer.Stop();
            var millisecond = filterTimer.Elapsed.TotalMilliseconds;
            Console.WriteLine(millisecond);
            Console.WriteLine(ss);

            // Baseline: replace each word from the file with '*' repeated to the same length.
            var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感词库大全.txt", System.Text.Encoding.UTF8);
            var ssx = sourceText;
            var replaceTimer = System.Diagnostics.Stopwatch.StartNew();
            foreach (var word in words)
            {
                // string.Replace throws on an empty search string, so blank lines are skipped
                if (word.Length > 0)
                    ssx = ssx.Replace(word, new string('*', word.Length));
            }
            replaceTimer.Stop();
            var millisecondx = replaceTimer.Elapsed.TotalMilliseconds;
            Console.WriteLine(millisecondx);
            Console.WriteLine(ssx);

技术群: 需要进技术群学习交流的请添加小编微信,切记备注:加群,对以上内容有什么疑问也可以直接和小编直接沟通交流!     

小编微信:mm1552923   

公众号:dotNet编程大全      

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值