敏感词过滤的方法有很多,我这篇是简陋版!!
实力有限,只能简单地匹配出结果,无法保证效率!
参考网上的资料,涂涂改改写下了这个类。
一 基础类
1.1 字符树
/// <summary>
/// A node of the character tree: one character plus the nodes that may follow it.
/// </summary>
public class CharacterTree
{
    /// <summary>
    /// The character stored in this node.
    /// </summary>
    public char Word { get; set; }

    /// <summary>
    /// Whether this node is flagged as the end of an entry.
    /// </summary>
    public bool IsEnd { get; set; }

    /// <summary>
    /// Child nodes; may be <c>null</c>.
    /// </summary>
    public List<CharacterTree> Child { get; set; }

    /// <summary>
    /// Describes the node: its character, its end flag and, when it has any,
    /// the number of child branches.
    /// </summary>
    /// <returns>
    /// The description, or the default <see cref="object.ToString"/> result when
    /// <see cref="Word"/> is the null character or white space.
    /// </returns>
    public override string ToString()
    {
        bool hasPrintableWord = Word != '\u0000' && !char.IsWhiteSpace(Word);
        if (!hasPrintableWord)
        {
            return base.ToString();
        }

        string description = string.Format("单词:{0},是否末尾:{1}", Word.ToString(), IsEnd.ToString());

        // The branch count is only appended when there is at least one child.
        int branchCount = Child == null ? 0 : Child.Count;
        return branchCount > 0
            ? string.Format("{0},共{1}条分支", description, branchCount)
            : description;
    }
}
1.2 字符信息
/// <summary>
/// A character together with the index at which it occurs in a string.
/// </summary>
public class Character
{
    /// <summary>
    /// Creates a character record.
    /// </summary>
    /// <param name="word">The character.</param>
    /// <param name="index">The index of the character in its string.</param>
    public Character(char word, int index)
    {
        Index = index;
        Word = word;
    }

    /// <summary>
    /// The character.
    /// </summary>
    public char Word { get; set; }

    /// <summary>
    /// The index at which the character occurs in the string.
    /// </summary>
    public int Index { get; set; }

    /// <summary>
    /// Describes the record as its character and index.
    /// </summary>
    /// <returns>
    /// The description, or the default <see cref="object.ToString"/> result when
    /// <see cref="Word"/> is the null character or white space.
    /// </returns>
    public override string ToString()
    {
        bool isBlank = Word == '\u0000' || char.IsWhiteSpace(Word);
        return isBlank
            ? base.ToString()
            : string.Format("字符:{0},索引:{1}", Word.ToString(), Index.ToString());
    }
}
二 词库的初始化
假如词库中有两个字符串["我需要你","我拿走了"],我设想词库就是一棵树,每一个字符串就是树上的树枝,字符就是上面的叶子。
按照设想,我绘制了图一。
字符“我”是这两个字符串共同的主干,字符“需”和字符“拿”则是它的分支。如果词库中再加入新的字符串,先按首字符查找是否已有对应的主干:没有则创建新的主干,已有则沿着主干逐个字符查找分支,缺少的分支再补上。
三 KeySearch检索类
提供了三个公开的方法:FindAll、Replace和IsContains,分别用于查找、替换和判断是否包含。其中查找的返回结果是文本中所有出现的词库词,没有对结果去重。
/// <summary>
/// Searches text for words from a fixed dictionary that is stored as a character tree.
/// <para>
/// Scanning is leftmost-longest and non-overlapping: at each position the longest
/// dictionary word starting there wins, and scanning resumes right after it. When no
/// word starts at a position, scanning resumes at the next character.
/// </para>
/// </summary>
public class KeySearch
{
    /// <summary>
    /// Builds the character tree from the dictionary.
    /// </summary>
    /// <param name="arry">Dictionary words; null or empty entries are skipped.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="arry"/> is null or contains no elements.
    /// </exception>
    public KeySearch(List<string> arry)
    {
        if (arry == null || arry.Count == 0)
        {
            throw new ArgumentNullException("arry", "词库不能为空");
        }

        this.Tree = CreateTree(arry);
    }

    #region Properties

    /// <summary>
    /// Root nodes of the character tree, one per distinct first character.
    /// </summary>
    public List<CharacterTree> Tree { get; private set; }

    /// <summary>
    /// When true, space characters (U+0020) are removed from the text before searching.
    /// Note that <see cref="Replace"/> then returns the text without those spaces.
    /// </summary>
    public bool IsIgnoreWhiteSpace { get; set; }

    private string _text = string.Empty;

    /// <summary>
    /// The text used by the most recent search, after optional space removal.
    /// </summary>
    /// <exception cref="ArgumentNullException">
    /// Thrown by the setter when the value is null, empty or white space only.
    /// </exception>
    public string Text
    {
        get { return _text; }
        private set
        {
            if (string.IsNullOrWhiteSpace(value))
            {
                throw new ArgumentNullException("text", "文本值不能为空");
            }

            _text = this.IsIgnoreWhiteSpace ? value.Replace(" ", "") : value;
        }
    }

    #endregion

    /// <summary>
    /// Finds every occurrence of a dictionary word in the text.
    /// </summary>
    /// <param name="text">Text to search.</param>
    /// <returns>
    /// The matched words in order of appearance. Duplicates are kept; the list is
    /// empty when nothing matches.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="text"/> is null, empty or white space only.
    /// </exception>
    public List<string> FindAll(string text)
    {
        this.Text = text;
        string source = this.Text;

        List<string> found = new List<string>();
        foreach (KeyValuePair<int, int> match in FindMatches(source))
        {
            found.Add(source.Substring(match.Key, match.Value));
        }

        return found;
    }

    /// <summary>
    /// Returns a copy of the text in which every character of every matched
    /// dictionary word is replaced by <paramref name="newChar"/>.
    /// </summary>
    /// <param name="text">Text to search.</param>
    /// <param name="newChar">Replacement character.</param>
    /// <returns>The masked text.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="text"/> is null, empty or white space only.
    /// </exception>
    public string Replace(string text, char newChar = '*')
    {
        this.Text = text;
        string source = this.Text;

        StringBuilder masked = new StringBuilder(source);
        foreach (KeyValuePair<int, int> match in FindMatches(source))
        {
            int end = match.Key + match.Value;
            for (int i = match.Key; i < end; i++)
            {
                masked[i] = newChar;
            }
        }

        return masked.ToString();
    }

    /// <summary>
    /// Tells whether the text contains at least one dictionary word.
    /// </summary>
    /// <param name="text">Text to search.</param>
    /// <returns>True when a dictionary word occurs in the text.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="text"/> is null, empty or white space only.
    /// </exception>
    public bool IsContains(string text)
    {
        this.Text = text;

        // The iterator is lazy, so this stops at the first match.
        foreach (KeyValuePair<int, int> match in FindMatches(this.Text))
        {
            return true;
        }

        return false;
    }

    /// <summary>
    /// Enumerates non-overlapping matches from left to right.
    /// </summary>
    /// <param name="source">Text to scan.</param>
    /// <returns>Pairs whose key is the start index and whose value is the match length.</returns>
    private IEnumerable<KeyValuePair<int, int>> FindMatches(string source)
    {
        int start = 0;
        while (start < source.Length)
        {
            int length = MatchLength(source, start);
            if (length > 0)
            {
                yield return new KeyValuePair<int, int>(start, length);
                start += length; // resume after the match so matches never overlap
            }
            else
            {
                // Nothing starts here; retry from the very next character so that a word
                // beginning inside a failed partial match is still found
                // (dictionary {"真的吗","你真好"}, text "你真的吗" -> "真的吗").
                start++;
            }
        }
    }

    /// <summary>
    /// Walks the tree from <paramref name="start"/> and returns the length of the
    /// longest dictionary word that begins there.
    /// </summary>
    /// <param name="source">Text to scan.</param>
    /// <param name="start">Index of the first character to match.</param>
    /// <returns>The length of the longest match, or 0 when no word starts at that index.</returns>
    private int MatchLength(string source, int start)
    {
        List<CharacterTree> level = this.Tree;
        int longest = 0;

        for (int i = start; i < source.Length && level != null; i++)
        {
            char current = source[i];
            CharacterTree node = level.Find(n => n.Word == current);
            if (node == null)
            {
                break;
            }

            // Remember the last complete word seen; a longer candidate may still fail.
            if (node.IsEnd)
            {
                longest = i - start + 1;
            }

            level = node.Child;
        }

        return longest;
    }

    #region Initialization

    /// <summary>
    /// Builds the character tree for the dictionary.
    /// </summary>
    /// <param name="arry">Dictionary words; null or empty entries are skipped.</param>
    /// <returns>The root nodes of the tree.</returns>
    private List<CharacterTree> CreateTree(List<string> arry)
    {
        List<CharacterTree> roots = new List<CharacterTree>();
        foreach (string word in arry)
        {
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            AddWord(roots, word);
        }

        return roots;
    }

    /// <summary>
    /// Inserts one word into the tree, reusing existing nodes for shared prefixes.
    /// </summary>
    /// <param name="roots">Root nodes of the tree.</param>
    /// <param name="word">A non-empty dictionary word.</param>
    private static void AddWord(List<CharacterTree> roots, string word)
    {
        CharacterTree parent = null;
        List<CharacterTree> siblings = roots;

        for (int i = 0; i < word.Length; i++)
        {
            char cha = word[i];

            // Leaf nodes keep Child == null; the list is created only when a child is added.
            if (siblings == null)
            {
                siblings = new List<CharacterTree>();
                parent.Child = siblings;
            }

            CharacterTree node = siblings.Find(n => n.Word == cha);
            if (node == null)
            {
                node = new CharacterTree() { Word = cha, IsEnd = false, Child = null };
                siblings.Add(node);
            }

            parent = node;
            siblings = node.Child;
        }

        // The node of the last character marks a complete word.
        parent.IsEnd = true;
    }

    #endregion
}
四 运行结果
如果词库中同时包含长词和它的前缀短词,如“你真的”和“你真”,则对文本“你真的”的查找结果是较长的“你真的”。
如果文本中两个词汇相互重叠,例如词库为“你真”和“真的吗”,文本为“你真的吗”,则替换后的文本为“**的吗”。