敏感词、文字过滤是一个网站必不可少的功能,因此设计一个好的、高效的过滤算法是非常有必要的。
在实现文字过滤的算法中,DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须要减少运算,而DFA算法中几乎没有什么计算,有的只是状态的转换。
下面看一下在C#中的实现方式。
1、构建敏感词库类
/// <summary>
/// Builds the in-memory lexicon on first use: one <c>WordGroup</c> slot per possible
/// first character, each holding the remainders (everything after the first character)
/// of the sensitive words that start with that character. Every word is registered both
/// as supplied and in its Traditional Chinese conversion when the two differ.
/// </summary>
/// <returns>
/// <c>true</c> when the lexicon is available (already built, or built by this call);
/// <c>false</c> when the word source returned <c>null</c>. In the latter case the lexicon
/// is left unset so that a later call tries again.
/// </returns>
private bool LoadDictionary()
{
    // Already built by an earlier call; nothing to do.
    if (_memoryLexicon != null)
    {
        return true;
    }

    var words = new SensitiveWordBll().GetAllWords();
    if (words == null)
    {
        // Do not publish an empty lexicon: that would make every later call report
        // success while silently filtering nothing.
        return false;
    }

    var wordList = new List<string>();
    foreach (string word in words)
    {
        // Null or empty entries cannot be indexed by first character; skip them.
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        wordList.Add(word);
        var chineseWord = Microsoft.VisualBasic.Strings.StrConv(word,
            Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
        if (word != chineseWord)
        {
            wordList.Add(chineseWord);
        }
    }

    // char.MaxValue is the highest valid index, so the table needs one more slot than
    // that; otherwise a first character of U+FFFF would index past the end.
    var lexicon = new WordGroup[char.MaxValue + 1];
    foreach (var word in wordList)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        var group = lexicon[word[0]];
        if (group == null)
        {
            group = new WordGroup();
            lexicon[word[0]] = group;
        }
        group.Add(word.Substring(1));
    }

    // Assign the field only once the table is fully populated.
    _memoryLexicon = lexicon;
    return true;
}
2、构建敏感词检测类
/// <summary>
/// Tries to match <paramref name="blackWord"/> against <c>_sourceText</c>, starting at the
/// character immediately after <c>_cursor</c>. Characters for which <c>IsChs</c>,
/// <c>IsNum</c> and <c>IsAlphabet</c> are all false are skipped over, and a mismatching
/// source character may be stepped past up to 4 times in a row before giving up.
/// Side effects: resets and then accumulates <c>_wordlenght</c>, and advances
/// <c>_nextCursor</c>.
/// </summary>
/// <param name="blackWord">
/// Characters to match one by one. NOTE(review): presumably the remainder of a dictionary
/// word after its first character - confirm against the caller.
/// </param>
/// <returns><c>true</c> if every character of <paramref name="blackWord"/> was matched; otherwise <c>false</c>.</returns>
private bool Check(string blackWord)
{
_wordlenght = 0;
// Cursor for the next source character to examine: one past the current cursor.
_nextCursor = _cursor + 1;
var found = false;
// Number of consecutive mismatches tolerated so far for the current word character.
var continueCheck = 0;
// Walk each character of the word and try to match it.
for (var i = 0; i < blackWord.Length; i++)
{
// Count of special characters skipped before the position that gets compared.
var offset = 0;
if (_nextCursor >= _sourceText.Length)
{
// Source exhausted before the word was fully matched.
// NOTE: inside this loop i < blackWord.Length, so this condition is always true.
if (i - 1 < blackWord.Length - 1)
found = false;
break;
}
else
{
// While the upcoming characters are not Chinese, digit or letter, bump the offset by 1.
for (var y = _nextCursor; y < _sourceText.Length; y++)
{
if (!IsChs(_sourceText[y]) && !IsNum(_sourceText[y]) && !IsAlphabet(_sourceText[y]))
{
offset++;
// Skipping special characters: stop if the next position would reach the end of the string.
if (_nextCursor + offset >= _sourceText.Length)
break;
// Each skipped special character still counts toward the matched span length.
_wordlenght++;
}
else break;
}
// Only special characters remained up to the end of the source: no match.
if (_nextCursor + offset >= _sourceText.Length)
{
found = false;
break;
}
if (blackWord[i] == _sourceText[_nextCursor + offset])
{
found = true;
// A successful match resets the mismatch tolerance.
continueCheck = 0;
}
else
{
// On a mismatch, keep trying for up to 4 more source characters.
if (continueCheck < 4 && _nextCursor < _sourceText.Length - 1)
{
continueCheck++;
// Undo the loop increment so the same word character is retried
// against the next source position.
i--;
}
else
{
found = false;
break;
}
}
}
// Advance past the skipped special characters and the character just examined.
_nextCursor = _nextCursor + 1 + offset;
_wordlenght++;
}
return found;
}
}
3、测试与使用方法
// Body of the filter routine (its signature is not part of this excerpt; it reads the
// `sourceText` and `isFirstCheckedReturn` values supplied by the caller).
// Scans the text once; for every position whose character has a lexicon group, tries each
// stored word remainder and masks the matched span with ReplaceChar, recording the
// original span in _illegalWords. Returns the masked text, or null as soon as the first
// match is found when isFirstCheckedReturn is set.
_illegalWords = new List<string>();
// Nothing to scan: neither the argument nor the previously stored text has content.
if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText))
{
    return sourceText;
}
if (!string.IsNullOrEmpty(sourceText))
{
    _sourceText = sourceText;
}
_cursor = 0;
// Without a lexicon there is nothing to filter against; hand the text back unchanged.
if (!LoadDictionary())
{
    return _sourceText;
}
var tempString = _sourceText.ToCharArray();
var sourceTextDbc = ToDBC(SourceText);
for (var i = 0; i < SourceText.Length; i++)
{
    // Look up the group of words that start with this character. The bounds guard keeps
    // a character code beyond the lexicon table from throwing.
    var firstChar = sourceTextDbc[i];
    var group = firstChar < _memoryLexicon.Length ? _memoryLexicon[firstChar] : null;
    if (group != null)
    {
        for (var z = 0; z < group.Count(); z++)
        {
            string word = group.GetWord(z);
            if (word.Length == 0)
            {
                // An empty remainder means the first character alone is the match.
                // Check() is not called in that case, so reset the span length here;
                // otherwise a stale value from an earlier Check() call would be used
                // and too many characters would be masked.
                _wordlenght = 0;
            }
            else if (!Check(word))
            {
                continue;
            }

            if (isFirstCheckedReturn)
            {
                return null;
            }
            // Collect the matched span (first character plus _wordlenght more) and mask it.
            var blackword = string.Empty;
            for (var pos = 0; pos < _wordlenght + 1; pos++)
            {
                blackword += tempString[pos + _cursor].ToString();
                tempString[pos + _cursor] = ReplaceChar;
            }
            _illegalWords.Add(blackword);
            // Jump both the cursor and the loop index past the masked span.
            _cursor = _cursor + _wordlenght;
            i = i + _wordlenght;
            break;
        }
    }
    _cursor++;
}
return new string(tempString);
// Demo / rough benchmark: run the filter over a sample text, then run a naive
// string.Replace pass over the same text using the word list file, printing the elapsed
// milliseconds and the result of each. Stopwatch is used for timing because it is a
// monotonic, high-resolution timer, unlike subtracting two DateTime.Now readings.
var filter = new SensitiveWordFilter();
filter.SourceText = "dddddd";
var sourctText = filter.SourceText;
// NOTE(review): presumably clears the cached lexicon, so the first timing below
// includes loading the dictionary - confirm against ResetMemoryLexicon.
filter.ResetMemoryLexicon();
var stopwatch = System.Diagnostics.Stopwatch.StartNew();
var ss = filter.Filter();
stopwatch.Stop();
var millisecond = stopwatch.Elapsed.TotalMilliseconds;
Console.WriteLine(millisecond);
Console.WriteLine(ss);

// Baseline: replace every word from the list file with asterisks of the same length.
var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感词库大全.txt", System.Text.Encoding.UTF8);
var ssx = sourctText;
var stopwatchx = System.Diagnostics.Stopwatch.StartNew();
foreach (var word in words)
{
    if (word.Length > 0)
    {
        ssx = ssx.Replace(word, new string('*', word.Length));
    }
}
stopwatchx.Stop();
var millisecondx = stopwatchx.Elapsed.TotalMilliseconds;
Console.WriteLine(millisecondx);
Console.WriteLine(ssx);
技术群: 需要进技术群学习交流的请添加小编微信,切记备注:加群,对以上内容有什么疑问也可以直接和小编沟通交流!
小编微信:mm1552923
公众号:dotNet编程大全