C# 简陋版词汇过滤

敏感词过滤的方法有很多,我这篇是简陋版!!

实力有限,只能简单地匹配出结果,无法保证效率。

参考网上的资料,涂涂改改写下了这个类。

一 基础类

 1.1 字符树

    /// <summary>
    /// One node of the keyword character tree: a character, an end-of-word flag
    /// and the list of nodes that may follow it.
    /// </summary>
    public class CharacterTree
    {
        /// <summary>
        /// Character stored in this node.
        /// </summary>
        public char Word { get; set; }

        /// <summary>
        /// End-of-word flag for this node.
        /// </summary>
        public bool IsEnd { get; set; }

        /// <summary>
        /// Child nodes; stays null until a list is assigned.
        /// </summary>
        public List<CharacterTree> Child { get; set; }

        /// <summary>
        /// Describes the node: its character, its end flag and, when it has children,
        /// the number of branches. Falls back to the default <see cref="object.ToString"/>
        /// when the character is '\u0000' or white space.
        /// </summary>
        /// <returns>A human-readable description of the node.</returns>
        public override string ToString()
        {
            bool hasPrintableWord = this.Word != '\u0000' && !char.IsWhiteSpace(this.Word);
            if (!hasPrintableWord)
            {
                return base.ToString();
            }

            string description = string.Format("单词:{0},是否末尾:{1}", this.Word.ToString(), this.IsEnd.ToString());

            // The branch count is appended only when there is at least one child.
            int branchCount = this.Child == null ? 0 : this.Child.Count;
            if (branchCount > 0)
            {
                description = string.Format("{0},共{1}条分支", description, branchCount);
            }

            return description;
        }
    }

1.2 字符信息

    /// <summary>
    /// A character paired with the index at which it appears in a string.
    /// </summary>
    public class Character
    {
        /// <summary>
        /// Creates an entry for <paramref name="word"/> located at <paramref name="index"/>.
        /// </summary>
        /// <param name="word">The character.</param>
        /// <param name="index">Index of the character within its string.</param>
        public Character(char word, int index)
        {
            this.Index = index;
            this.Word = word;
        }

        /// <summary>
        /// The character.
        /// </summary>
        public char Word { get; set; }

        /// <summary>
        /// Index at which the character appears in the string.
        /// </summary>
        public int Index { get; set; }

        /// <summary>
        /// Describes the character and its index. Falls back to the default
        /// <see cref="object.ToString"/> when the character is '\u0000' or white space.
        /// </summary>
        /// <returns>A human-readable description of the entry.</returns>
        public override string ToString()
        {
            bool isBlank = this.Word == '\u0000' || char.IsWhiteSpace(this.Word);
            return isBlank
                ? base.ToString()
                : string.Format("字符:{0},索引:{1}", this.Word.ToString(), this.Index.ToString());
        }
    }

二 词库的初始化

        假如词库中有两个字符串["我需要你","我拿走了"],我设想词库就是一棵树,每一个字符串就是树上的树枝,字符就是上面的叶子。
        按照设想,我绘制了图一。

流程树
流程树

     字符“我”是这两个字符串共有的主干,字符“需”和字符“拿”则是分支。如果词库中又加入了新的字符串,先按首字符查找是否已有对应的主干:没有则创建新的主干;已存在则沿着该主干逐个字符查找分支,缺少的分支再补上。

三 KeySearch检索类

     提供了三个公开的方法:FindAll、Replace和IsContains,查找、替换和是否包含。其中查找的返回结果是文本中所有出现的词库词,没有对结果排除重复。

    /// <summary>
    /// Keyword searcher backed by a character tree built from a word list.
    /// <para>
    /// Text is scanned from left to right. At every position the longest keyword that
    /// starts there is taken, and scanning resumes right after it, so the reported
    /// matches never overlap. Positions where no keyword starts are skipped one
    /// character at a time.
    /// </para>
    /// </summary>
    public class KeySearch
    {
        /// <summary>
        /// Builds the character tree from the given word list.
        /// </summary>
        /// <param name="arry">Word list. Null or empty entries are skipped.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="arry"/> is null or contains no elements.
        /// </exception>
        public KeySearch(List<string> arry)
        {
            if (arry == null || arry.Count == 0)
                throw new ArgumentNullException("arry", "词库不能为空");

            this.Tree = CreateTree(arry);
        }

        #region Properties

        /// <summary>
        /// Top level of the character tree: one node per distinct first character
        /// of the words in the word list.
        /// </summary>
        public List<CharacterTree> Tree { get; private set; }

        /// <summary>
        /// When true, space characters (' ') are removed from the text before it is
        /// searched. Only the space character is removed; other white space is kept.
        /// </summary>
        public bool IsIgnoreWhiteSpace { get; set; }

        private string _text = string.Empty;
        /// <summary>
        /// The text used by the most recent search call, after optional space removal.
        /// </summary>
        /// <exception cref="ArgumentNullException">
        /// Thrown by the setter when the value is null, empty or white space only.
        /// </exception>
        public string Text
        {
            get { return _text; }
            private set
            {
                if (string.IsNullOrWhiteSpace(value))
                    throw new ArgumentNullException("text", "文本值不能为空");
                if (this.IsIgnoreWhiteSpace)
                    _text = value.Replace(" ", "");
                else
                    _text = value;
            }
        }

        #endregion

        /// <summary>
        /// Finds every keyword occurrence in the text, in order of appearance.
        /// Duplicates are not removed.
        /// </summary>
        /// <param name="text">Text to search.</param>
        /// <returns>The matched keywords; empty when nothing matches.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="text"/> is null, empty or white space only.
        /// </exception>
        public List<string> FindAll(string text)
        {
            this.Text = text;
            string source = this.Text;
            List<string> list = new List<string>();

            int i = 0;
            while (i < source.Length)
            {
                int length = MatchLength(source, i);
                if (length == 0)
                {
                    i++;
                    continue;
                }

                list.Add(source.Substring(i, length));
                i += length; // resume after the match so matches never overlap
            }

            return list;
        }

        /// <summary>
        /// Returns a copy of the text in which every character of every matched
        /// keyword is replaced by <paramref name="newChar"/>. When
        /// <see cref="IsIgnoreWhiteSpace"/> is true the result is based on the text
        /// with its space characters removed.
        /// </summary>
        /// <param name="text">Text to search.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The masked text.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="text"/> is null, empty or white space only.
        /// </exception>
        public string Replace(string text, char newChar = '*')
        {
            this.Text = text;
            string source = this.Text;
            StringBuilder newString = new StringBuilder(source);

            int i = 0;
            while (i < source.Length)
            {
                int length = MatchLength(source, i);
                if (length == 0)
                {
                    i++;
                    continue;
                }

                for (int offset = 0; offset < length; offset++)
                {
                    newString[i + offset] = newChar;
                }
                i += length;
            }

            return newString.ToString();
        }

        /// <summary>
        /// Tells whether the text contains at least one complete keyword.
        /// A bare prefix of a keyword does not count as a match.
        /// </summary>
        /// <param name="text">Text to search.</param>
        /// <returns>True when a keyword occurs in the text; otherwise false.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="text"/> is null, empty or white space only.
        /// </exception>
        public bool IsContains(string text)
        {
            this.Text = text;
            string source = this.Text;

            for (int i = 0; i < source.Length; i++)
            {
                if (MatchLength(source, i) > 0)
                    return true;
            }
            return false;
        }

        /// <summary>
        /// Walks the character tree along <paramref name="source"/> starting at
        /// <paramref name="start"/> and returns the length of the longest keyword that
        /// begins there, or 0 when no keyword begins at that position.
        /// </summary>
        /// <param name="source">Text being searched.</param>
        /// <param name="start">Index of the first character to test.</param>
        /// <returns>Length of the longest match, or 0.</returns>
        private int MatchLength(string source, int start)
        {
            List<CharacterTree> level = this.Tree;
            int matched = 0;

            for (int i = start; i < source.Length && level != null; i++)
            {
                CharacterTree node = FindChild(level, source[i]);
                if (node == null)
                    break;

                // Remember the last position at which a whole keyword ended; a longer
                // candidate that fails later must not discard this shorter match.
                if (node.IsEnd)
                    matched = i - start + 1;

                level = node.Child;
            }

            return matched;
        }

        /// <summary>
        /// Returns the node in <paramref name="level"/> that holds <paramref name="cha"/>,
        /// or null when there is none.
        /// </summary>
        private static CharacterTree FindChild(List<CharacterTree> level, char cha)
        {
            return level.Find(a => a.Word == cha);
        }

        #region Initialization

        /// <summary>
        /// Builds the character tree for the word list. Null or empty words are skipped.
        /// </summary>
        /// <param name="arry">Word list.</param>
        /// <returns>The top-level nodes of the tree.</returns>
        private List<CharacterTree> CreateTree(List<string> arry)
        {
            List<CharacterTree> roots = new List<CharacterTree>();

            foreach (string word in arry)
            {
                if (string.IsNullOrEmpty(word))
                    continue;
                AddWord(roots, word);
            }

            return roots;
        }

        /// <summary>
        /// Inserts one word into the tree, reusing the nodes of any existing prefix
        /// and marking the node of the last character as a word end.
        /// </summary>
        /// <param name="roots">Top-level nodes of the tree.</param>
        /// <param name="word">Non-empty word to insert.</param>
        private static void AddWord(List<CharacterTree> roots, string word)
        {
            List<CharacterTree> level = roots;

            for (int i = 0; i < word.Length; i++)
            {
                char cha = word[i];
                CharacterTree node = FindChild(level, cha);
                if (node == null)
                {
                    node = new CharacterTree() { Word = cha, IsEnd = false, Child = null };
                    level.Add(node);
                }

                if (i == word.Length - 1)
                {
                    node.IsEnd = true;
                    return;
                }

                // The child list is created only when a following character exists,
                // so a node without children keeps Child == null.
                if (node.Child == null)
                    node.Child = new List<CharacterTree>();
                level = node.Child;
            }
        }

        #endregion
    }

四 运行结果

    如果词库中同时存在长短词汇,如“你真的”和“你真”,而文本中出现“你真的”,则返回较长的“你真的”。

   如果文本中两个词汇相互重叠,例如词库为“你真”和“真的吗”、文本为“你真的吗”,则先匹配到的“你真”被替换,替换后的文本为“**的吗”。

   

运行结果
运行结果

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值