DFA 全称 Deterministic Finite Automaton,即确定有穷自动机:从一个状态出发,经过一系列事件转换到另一个状态,即 state -> event -> state。状态以及引起状态转换的事件都是确定的,并且状态和事件的数量都是有限(可穷举)的。
下面给出用 C# 实现的基于 DFA 的敏感词过滤逻辑,并贴出整个类的完整代码。相互学习,勿喷。
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
namespace WeChatCapture
{
/// <summary>
/// Sensitive-word filter built on a deterministic finite automaton (DFA).
/// The automaton is stored as a tree of nested <see cref="Hashtable"/> nodes:
/// each node maps a <see cref="char"/> to its child node and, for every node
/// except the root, maps the string key "IsEnd" to 1 when the path from the
/// root to that node spells a complete sensitive word (0 otherwise).
/// </summary>
public class WordSensitiveDFA
{
    /// <summary>
    /// Key under which every non-root node stores its end-of-word marker.
    /// </summary>
    private const string IsEndKey = "IsEnd";

    /// <summary>
    /// Raw set of sensitive words the lookup table is built from.
    /// </summary>
    public HashSet<string> sensitiveWordSet;

    /// <summary>
    /// Root node of the lookup table built by <see cref="InitSensitiveWordMap"/>.
    /// </summary>
    public Hashtable sensitiveWordTable = new Hashtable();

    /// <summary>
    /// Creates the filter and builds the lookup table from the given words.
    /// </summary>
    /// <param name="SensitiveWordsArray">
    /// Sensitive words. Duplicates are collapsed; null or empty entries are ignored
    /// when the table is built.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// Thrown by the <see cref="HashSet{T}"/> constructor when the array itself is null.
    /// </exception>
    public WordSensitiveDFA(string[] SensitiveWordsArray)
    {
        sensitiveWordSet = new HashSet<string>(SensitiveWordsArray);
        InitSensitiveWordMap();
    }

    /// <summary>
    /// Rebuilds <see cref="sensitiveWordTable"/> from <see cref="sensitiveWordSet"/>.
    /// </summary>
    public void InitSensitiveWordMap()
    {
        sensitiveWordTable = new Hashtable(sensitiveWordSet.Count);
        foreach (string word in sensitiveWordSet)
        {
            // A null entry would throw on enumeration and an empty one adds nothing.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            Hashtable node = sensitiveWordTable;
            foreach (char c in word)
            {
                Hashtable child = (Hashtable)node[c];
                if (child == null)
                {
                    child = new Hashtable();
                    child.Add(IsEndKey, 0);
                    node.Add(c, child);
                }
                node = child;
            }

            // "node" is now the node of the word's last character: mark it as a word end.
            node[IsEndKey] = 1;
        }
    }

    /// <summary>
    /// Looks for a sensitive word that starts exactly at <paramref name="startIndex"/>.
    /// </summary>
    /// <param name="content">Text to inspect. Null or empty text never matches.</param>
    /// <param name="startIndex">Zero-based position in <paramref name="content"/> where the match must begin.</param>
    /// <returns>
    /// The length of the longest sensitive word starting at <paramref name="startIndex"/>,
    /// or 0 when no sensitive word starts there (including when the index is past the end of the text).
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="startIndex"/> is negative.</exception>
    public int SearchSensitiveWord(string content, int startIndex)
    {
        if (startIndex < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(startIndex), "startIndex must not be negative.");
        }
        if (string.IsNullOrEmpty(content))
        {
            return 0;
        }

        Hashtable node = sensitiveWordTable;
        int matchedLength = 0;
        for (int i = startIndex; i < content.Length; i++)
        {
            Hashtable next = (Hashtable)node[content[i]];
            if (next == null)
            {
                break;
            }

            // Always descend, even through an end-of-word node; otherwise the next
            // character is looked up in the wrong node, which both misses longer
            // words and accepts characters that are not part of any word.
            node = next;

            // Record the length only at completed words, so a partially matched
            // longer word does not inflate the result.
            if ((int)next[IsEndKey] == 1)
            {
                matchedLength = i - startIndex + 1;
            }
        }
        return matchedLength;
    }

    /// <summary>
    /// Collects every sensitive word occurrence found in the text, scanning left to right.
    /// After a match the scan resumes right behind it, so occurrences do not overlap.
    /// </summary>
    /// <param name="txt">Text to inspect. Null or empty text yields an empty list.</param>
    /// <returns>The matched substrings in order of appearance; never null.</returns>
    public List<string> GetAllSensitiveWords(string txt)
    {
        List<string> result = new List<string>();
        if (string.IsNullOrEmpty(txt))
        {
            return result;
        }

        int i = 0;
        while (i < txt.Length)
        {
            int length = SearchSensitiveWord(txt, i);
            if (length > 0)
            {
                result.Add(txt.Substring(i, length));
                i += length;
            }
            else
            {
                i++;
            }
        }
        return result;
    }

    /// <summary>
    /// Overwrites every character of every sensitive word occurrence with the NUL character ('\0').
    /// The returned string has the same length as the input.
    /// </summary>
    /// <param name="txt">Text to filter. Null or empty text is returned unchanged.</param>
    /// <returns>The text with all sensitive word characters replaced by '\0'.</returns>
    public string ReplaceSensitiveWords(string txt)
    {
        if (string.IsNullOrEmpty(txt))
        {
            return txt;
        }

        StringBuilder filtered = new StringBuilder(txt);
        int i = 0;
        while (i < txt.Length)
        {
            int len = SearchSensitiveWord(txt, i);
            if (len > 0)
            {
                for (int j = 0; j < len; j++)
                {
                    filtered[i + j] = '\0';
                }
                i += len;
            }
            else
            {
                ++i;
            }
        }
        return filtered.ToString();
    }
}
}