.NET上Aho-Corasick字符串匹配算法的高效快速实现

AhoCorasickTree.cs

using System;
using System.Collections.Generic;
using System.Linq;

namespace AhoCorasick.Net
{
    /// <summary>
    /// Multi-pattern string matcher based on the Aho-Corasick automaton.
    /// The automaton (a trie plus failure links) is built once in the constructor
    /// and is not modified afterwards; <see cref="Contains"/> and <see cref="Search"/>
    /// then scan a text in a single left-to-right pass.
    /// </summary>
    public class AhoCorasickTree
    {
        private readonly AhoCorasickTreeNode _rootNode;

        /// <summary>
        /// Builds the automaton for the given keywords.
        /// </summary>
        /// <param name="keywords">
        /// Patterns to look for. Empty strings are ignored (an empty pattern would
        /// mark the root as a match state and corrupt every other state); a keyword
        /// listed more than once is registered only once.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="keywords"/> is empty or contains a null element.
        /// </exception>
        public AhoCorasickTree(string[] keywords)
        {
            if (keywords == null) throw new ArgumentNullException("keywords");
            if (keywords.Length == 0) throw new ArgumentException("should contain keywords");

            _rootNode = new AhoCorasickTreeNode();

            var length = keywords.Length;
            for (var i = 0; i < length; i++)
            {
                AddPatternToTree(keywords[i]);
            }

            SetFailures();
        }

        /// <summary>
        /// Tells whether at least one keyword occurs anywhere in <paramref name="text"/>.
        /// Stops at the first match.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>true if any keyword is a substring of the text; otherwise false.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public bool Contains(string text)
        {
            if (text == null) throw new ArgumentNullException("text");

            var currentNode = _rootNode;

            var length = text.Length;
            for (var i = 0; i < length; i++)
            {
                currentNode = GetNextNode(currentNode, text[i]);

                // IsFinished also covers keywords that end here as a proper suffix
                // of the current path (propagated in SetFailures).
                if (currentNode.IsFinished)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Finds every occurrence of every keyword in <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        /// A lazily evaluated sequence of pairs: the key is the matched keyword and the
        /// value is the zero-based index in the text where that occurrence starts.
        /// Matches are produced in order of their end position.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public IEnumerable<KeyValuePair<string, int>> Search(string text)
        {
            // Validate here rather than inside the iterator so the exception is
            // thrown at the call site and not on first enumeration.
            if (text == null) throw new ArgumentNullException("text");

            return SearchIterator(text);
        }

        // Iterator part of Search; kept separate so argument validation is eager.
        private IEnumerable<KeyValuePair<string, int>> SearchIterator(string text)
        {
            var currentNode = _rootNode;

            var length = text.Length;
            for (var i = 0; i < length; i++)
            {
                currentNode = GetNextNode(currentNode, text[i]);
                if (!currentNode.IsFinished)
                {
                    continue;
                }

                foreach (var result in currentNode.Results)
                {
                    // i is the index of the last matched character.
                    yield return new KeyValuePair<string, int>(result, i - result.Length + 1);
                }
            }
        }

        // Automaton transition: follows failure links from currentNode until a state
        // with an outgoing edge for c is found. The root is always tried as the last
        // resort; when even the root has no such edge the automaton stays at the root.
        private AhoCorasickTreeNode GetNextNode(AhoCorasickTreeNode currentNode, char c)
        {
            while (true)
            {
                var node = currentNode.GetNode(c);
                if (node != null)
                {
                    return node;
                }

                if (currentNode == _rootNode)
                {
                    return _rootNode;
                }

                currentNode = currentNode.Failure;
            }
        }

        // Inserts one keyword into the trie, creating missing nodes along its path.
        private void AddPatternToTree(string pattern)
        {
            if (pattern == null)
            {
                throw new ArgumentException("keywords should not contain null", "keywords");
            }

            var length = pattern.Length;
            if (length == 0)
            {
                // An empty pattern would end at the root; see the constructor docs.
                return;
            }

            var latestNode = _rootNode;
            for (var i = 0; i < length; i++)
            {
                latestNode = latestNode.GetNode(pattern[i])
                             ?? latestNode.AddNode(pattern[i]);
            }

            // While patterns are being added, IsFinished is only ever set right here,
            // so a finished node means this exact keyword was already registered.
            if (latestNode.IsFinished)
            {
                return;
            }

            latestNode.IsFinished = true;
            latestNode.Results.Add(pattern);
        }

        // Computes the failure link of every node in breadth-first order and merges
        // the match lists along those links, so each node knows all keywords that end
        // at it (its own keyword plus every keyword that is a suffix of its path).
        private void SetFailures()
        {
            _rootNode.Failure = _rootNode;
            var queue = new Queue<AhoCorasickTreeNode>();
            queue.Enqueue(_rootNode);

            while (queue.Count > 0)
            {
                var currentNode = queue.Dequeue();
                foreach (var node in currentNode.Nodes)
                {
                    queue.Enqueue(node);
                }

                if (currentNode == _rootNode)
                {
                    continue;
                }

                // Walk the parent's failure chain until a state that can consume
                // this node's character is found (or the root is reached).
                var fallback = currentNode.Parent.Failure;
                var key = currentNode.Key;
                var failure = fallback.GetNode(key);
                while (failure == null && fallback != _rootNode)
                {
                    fallback = fallback.Failure;
                    failure = fallback.GetNode(key);
                }

                // Direct children of the root find themselves here; they fail to the root.
                if (failure == null || failure == currentNode)
                {
                    failure = _rootNode;
                }

                currentNode.Failure = failure;

                // The failure target is strictly shallower, so breadth-first order
                // guarantees its Results list is already complete.
                if (failure.IsFinished)
                {
                    currentNode.IsFinished = true;
                    currentNode.Results.AddRange(failure.Results);
                }
            }
        }

        // One automaton state. Children are kept in a small chained hash table whose
        // bucket count always equals the number of children.
        private class AhoCorasickTreeNode
        {
            public readonly AhoCorasickTreeNode Parent;
            public AhoCorasickTreeNode Failure;
            // True when at least one keyword ends at this state.
            public bool IsFinished;
            // Keywords that end at this state.
            public List<string> Results;
            // Character on the edge from Parent to this node (a placeholder for the root).
            public readonly char Key;

            // _buckets[h] is the index in _entries of the first entry of chain h, or -1.
            private int[] _buckets;
            private int _count;
            private Entry[] _entries;

            internal AhoCorasickTreeNode()
                : this(null, ' ')
            {
            }

            private AhoCorasickTreeNode(AhoCorasickTreeNode parent, char key)
            {
                Key = key;
                Parent = parent;

                _buckets = new int[0];
                _entries = new Entry[0];
                Results = new List<string>();
            }

            // Snapshot of all child nodes, in insertion order.
            public AhoCorasickTreeNode[] Nodes
            {
                get { return _entries.Select(x => x.Value).ToArray(); }
            }

            // Creates a child for key and returns it. Does not check whether a child
            // for key already exists; callers look it up with GetNode first.
            public AhoCorasickTreeNode AddNode(char key)
            {
                var node = new AhoCorasickTreeNode(this, key);

                // Grow the table by exactly one slot and rehash the existing entries.
                var newSize = _count + 1;
                Resize(newSize);

                var targetBucket = key % newSize;
                _entries[_count].Key = key;
                _entries[_count].Value = node;
                _entries[_count].Next = _buckets[targetBucket];
                _buckets[targetBucket] = _count;
                _count++;

                return node;
            }

            // Returns the child reached by key, or null when there is none.
            public AhoCorasickTreeNode GetNode(char key)
            {
                if (_count == 0) return null;

                // Bucket count equals _count (see AddNode/Resize); chains end at -1.
                var bucketIndex = key % _count;
                for (var i = _buckets[bucketIndex]; i >= 0; i = _entries[i].Next)
                {
                    if (_entries[i].Key == key)
                    {
                        return _entries[i].Value;
                    }
                }

                return null;
            }

            // Reallocates both arrays to newSize and rebuilds the bucket chains of the
            // entries that already exist.
            private void Resize(int newSize)
            {
                var newBuckets = new int[newSize];
                for (var i = 0; i < newSize; i++)
                {
                    newBuckets[i] = -1;
                }

                var newEntries = new Entry[newSize];
                Array.Copy(_entries, 0, newEntries, 0, _entries.Length);

                // rebalancing buckets for existing entries
                for (var i = 0; i < _entries.Length; i++)
                {
                    var bucket = newEntries[i].Key % newSize;
                    newEntries[i].Next = newBuckets[bucket];
                    newBuckets[bucket] = i;
                }

                _buckets = newBuckets;
                _entries = newEntries;
            }

            private struct Entry
            {
                public char Key;
                public int Next;
                public AhoCorasickTreeNode Value;
            }
        }
    }
}

使用示例

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using AhoCorasick.Net;

namespace Test
{
    // Console demo: prints the result of a few Contains probes, then the number of
    // matches Search finds for a set of overlapping keywords, and waits for a key press.
    class Program
    {
        static void Main(string[] args)
        {
            var matcher = new AhoCorasickTree(new[] { "ab", "abc", "bcd" });
            var probes = new[] { "d", "bce", "bcd", "abcd" };
            foreach (var probe in probes)
            {
                Console.WriteLine(matcher.Contains(probe));
            }

            //---------------------------------------------
            // Test two
            //var keywords = new AhoCorasickTree(new[] { "Mozilla", "6.3", "KHTML", "someKeyword"});
            //var userAgent =
            //    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36";
            //var keywordsPositions = keywords.Search(userAgent).ToList();
            //Console.WriteLine(keywordsPositions.Count);
            //Console.WriteLine(keywords.Contains("MozillasomeKeyword"));

            //----------------------------------------------------
            // Overlapping keywords: count every occurrence found in the sample text.
            var overlapping = new AhoCorasickTree(new[] { "a", "ab", "bab", "bc", "bca", "c", "caa" });
            var occurrences = overlapping.Search("abccab").ToList();
            Console.Write(occurrences.Count);

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
    }

    // Benchmark fixture: builds a tree from NumberOfKeywords random keywords of
    // LengthOfKeyword characters each and probes it with the first of them.
    public class SmallTreeBenchmark
    {
        private const int LengthOfKeyword = 8;
        private const int NumberOfKeywords = 100;
        private readonly string _keyword;
        private readonly AhoCorasickTree _sut;

        public SmallTreeBenchmark()
        {
            var generator = new RandomString();
            var patterns = new string[NumberOfKeywords];

            var index = 0;
            while (index < patterns.Length)
            {
                patterns[index] = generator.GetRandomString(LengthOfKeyword);
                index++;
            }

            // The probe is one of the generated keywords.
            _keyword = patterns[0];
            _sut = new AhoCorasickTree(patterns);
        }

        // Runs a single Contains lookup of the remembered keyword against the tree.
        public bool ContainsSmallWord()
        {
            var found = _sut.Contains(_keyword);
            return found;
        }

    }


    // Produces random strings made of ASCII letters (upper and lower case).
    public class RandomString
    {
        private const string Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
        private readonly Random _random = new Random();

        // Returns a string of the requested length; every character is drawn
        // independently from Chars using this instance's random generator.
        public string GetRandomString(int length)
        {
            var buffer = new char[length];

            var position = 0;
            while (position < buffer.Length)
            {
                var pick = _random.Next(Chars.Length);
                buffer[position] = Chars[pick];
                position++;
            }

            return new string(buffer);
        }
    }
}

如图:

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值