Aho-Corasick 算法的 C# 实现(简单关键字过滤)

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ConsoleTest
{
    internal class Program
    {
        /// <summary>
        ///     Demo entry point: looks up the blocked keywords in a sample string,
        ///     replaces every keyword that was found with "****" and prints the result.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        private static void Main(string[] args)
        {
            var sample = "sswo1lfsss殺殺殺尼玛币阿三勾特朗普大蘇打阿薩伊万卡";
            var hits = CheckDirtyWords(sample);

            if (hits != null)
            {
                // string.Replace masks every occurrence of the keyword, not just the reported one.
                foreach (var hit in hits)
                {
                    sample = sample.Replace(hit.Keyword, "****");
                }
            }

            Console.WriteLine(sample);

            // Block until Enter is pressed so the output stays visible.
            Console.ReadLine();
        }

        /// <summary>
        ///     Searches the given text for the built-in list of blocked keywords.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        ///     One entry per keyword occurrence found in <paramref name="text" />,
        ///     or <c>null</c> when the keyword list is empty.
        /// </returns>
        private static List<KeywordSearchResult> CheckDirtyWords(string text)
        {
            // Blocked keywords, separated by '|'.
            var blockedWords = "wolf|jason|特朗普|尼玛币";
            if (string.IsNullOrEmpty(blockedWords))
            {
                return null;
            }

            var filter = new KeywordFilter(blockedWords.Split('|').ToList());
            return filter.FindAllKeywords(text);
        }
    }

    /// <summary>
    ///     Multi-keyword matcher based on an Aho-Corasick automaton: a trie of the
    ///     keywords whose nodes carry failure links, so a text can be scanned for all
    ///     keywords in a single pass.
    /// </summary>
    public class KeywordFilter
    {
        /// <summary>
        ///     One state of the automaton (a trie node).
        /// </summary>
        private class Node
        {
            // Child nodes keyed by the character that leads to them.
            private readonly Dictionary<char, Node> transDict = new Dictionary<char, Node>();

            public Node(char c, Node parent)
            {
                Char = c;
                Parent = parent;
                Results = new List<string>();
            }

            /// <summary>Character on the edge leading from <see cref="Parent" /> to this node.</summary>
            public char Char { get; }

            /// <summary>Parent node in the trie; <c>null</c> for the root.</summary>
            public Node Parent { get; }

            /// <summary>Node to fall back to when no transition exists for the next character.</summary>
            public Node Failure { get; set; }

            /// <summary>
            ///     Live view of the child nodes. Exposed directly from the dictionary so that
            ///     adding a transition does not have to rebuild a separate list.
            /// </summary>
            public Dictionary<char, Node>.ValueCollection Transitions => transDict.Values;

            /// <summary>Keywords that end at this node, in insertion order.</summary>
            public List<string> Results { get; }

            /// <summary>Registers a keyword on this node unless it is already present.</summary>
            public void AddResult(string result)
            {
                if (!Results.Contains(result)) Results.Add(result);
            }

            /// <summary>Adds a child node, keyed by the child's own character.</summary>
            public void AddTransition(Node node)
            {
                transDict.Add(node.Char, node);
            }

            /// <summary>Returns the child reached via <paramref name="c" />, or <c>null</c> if there is none.</summary>
            public Node GetTransition(char c)
            {
                Node node;
                return transDict.TryGetValue(c, out node) ? node : null;
            }

            /// <summary>Tells whether a child exists for <paramref name="c" />.</summary>
            public bool ContainsTransition(char c)
            {
                return transDict.ContainsKey(c);
            }
        }

        private Node root; // root state of the automaton
        private readonly string[] keywords; // all usable (non-empty) keywords

        /// <summary>
        ///     Builds the automaton for the given keywords.
        /// </summary>
        /// <param name="keywords">Keywords to search for. <c>null</c> and empty entries are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="keywords" /> is <c>null</c>.</exception>
        public KeywordFilter(IEnumerable<string> keywords)
        {
            if (keywords == null) throw new ArgumentNullException(nameof(keywords));

            // Drop null/empty entries (e.g. produced by splitting "a||b" or "a|"):
            // a null entry cannot be enumerated, and an empty one would attach a
            // zero-length result to the root node, which FilterKeywords cannot mask.
            this.keywords = keywords.Where(k => !string.IsNullOrEmpty(k)).ToArray();
            Initialize();
        }

        /// <summary>
        ///     Builds the trie from the keywords and then computes the failure links.
        /// </summary>
        private void Initialize()
        {
            root = new Node(' ', null);

            // Insert every keyword into the trie.
            foreach (var keyword in keywords)
            {
                var tail = root;
                foreach (var c in keyword)
                {
                    var next = tail.GetTransition(c);
                    if (next == null)
                    {
                        next = new Node(c, tail);
                        tail.AddTransition(next);
                    }

                    tail = next;
                }

                tail.AddResult(keyword);
            }

            // Depth-1 nodes always fail back to the root.
            var level = new List<Node>();
            foreach (var first in root.Transitions)
            {
                first.Failure = root;
                level.AddRange(first.Transitions);
            }

            // Remaining nodes, breadth-first, so that the failure link (and the results)
            // of every shallower node are final before a deeper node reads them.
            while (level.Count != 0)
            {
                var nextLevel = new List<Node>();
                foreach (var node in level)
                {
                    // root.Failure is still null at this point, which ends the walk
                    // once the root itself has been tried.
                    var fallback = node.Parent.Failure;
                    while (fallback != null && !fallback.ContainsTransition(node.Char))
                        fallback = fallback.Failure;

                    if (fallback == null)
                    {
                        node.Failure = root;
                    }
                    else
                    {
                        node.Failure = fallback.GetTransition(node.Char);

                        // Keywords ending at the failure node are suffixes of the path to
                        // this node, so they end here as well.
                        foreach (var result in node.Failure.Results) node.AddResult(result);
                    }

                    nextLevel.AddRange(node.Transitions);
                }

                level = nextLevel;
            }

            // The root fails to itself.
            root.Failure = root;
        }

        /// <summary>
        ///     Performs one automaton step: follows failure links from
        ///     <paramref name="current" /> until a transition for <paramref name="c" />
        ///     exists, or the root is reached without one.
        /// </summary>
        /// <returns>The state after consuming <paramref name="c" />.</returns>
        private Node Advance(Node current, char c)
        {
            while (true)
            {
                var next = current.GetTransition(c);
                if (next != null) return next;
                if (current == root) return root;

                current = current.Failure;
            }
        }

        /// <summary>
        ///     Finds every occurrence of every keyword in the text.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        ///     One result per occurrence, holding the start index and the keyword.
        ///     The list is empty when nothing matches.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text" /> is <c>null</c>.</exception>
        public List<KeywordSearchResult> FindAllKeywords(string text)
        {
            if (text == null) throw new ArgumentNullException(nameof(text));

            var list = new List<KeywordSearchResult>();

            var current = root;
            for (var index = 0; index < text.Length; ++index)
            {
                current = Advance(current, text[index]);

                // Every keyword in Results ends at 'index'; derive its start from its length.
                foreach (var keyword in current.Results)
                    list.Add(new KeywordSearchResult(index - keyword.Length + 1, keyword));
            }

            return list;
        }

        /// <summary>
        ///     Returns a copy of the text in which the characters of matched keywords
        ///     are replaced by '*', one star per character.
        /// </summary>
        /// <param name="text">Text to filter.</param>
        /// <returns>The masked text; it has the same length as <paramref name="text" />.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text" /> is <c>null</c>.</exception>
        public string FilterKeywords(string text)
        {
            if (text == null) throw new ArgumentNullException(nameof(text));

            var masked = text.ToCharArray();

            var current = root;
            for (var index = 0; index < text.Length; index++)
            {
                current = Advance(current, text[index]);
                if (current.Results.Count == 0) continue;

                // Only the first result is masked. It is the longest keyword ending here:
                // a node's own keyword is registered before any result inherited through
                // its failure link, so it also covers the shorter ones.
                var length = current.Results[0].Length;
                for (var i = index - length + 1; i <= index; i++) masked[i] = '*';
            }

            return new string(masked);
        }
    }

    /// <summary>
    ///     A single search hit: a keyword together with the position it was found at.
    /// </summary>
    public struct KeywordSearchResult
    {
        /// <summary>
        ///     Placeholder value with index -1 and an empty keyword.
        /// </summary>
        public static readonly KeywordSearchResult Empty = new KeywordSearchResult(-1, string.Empty);

        private readonly int index;
        private readonly string keyword;

        /// <summary>
        ///     Creates a result for <paramref name="keyword" /> at <paramref name="index" />.
        /// </summary>
        /// <param name="index">Position of the hit.</param>
        /// <param name="keyword">Keyword that was found.</param>
        public KeywordSearchResult(int index, string keyword)
        {
            this.index = index;
            this.keyword = keyword;
        }

        /// <summary>
        ///     Position of the hit, as passed to the constructor.
        /// </summary>
        public int Index => index;

        /// <summary>
        ///     Keyword that was found, as passed to the constructor.
        /// </summary>
        public string Keyword => keyword;
    }
}

如图:

在这里插入图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值