AhoCorasickTree.cs
using System;
using System.Collections.Generic;
using System.Linq;
namespace AhoCorasick.Net
{
/// <summary>
/// Multi-keyword matcher: a trie of keywords whose nodes carry failure links,
/// so a text can be scanned once for all keywords at the same time.
/// </summary>
public class AhoCorasickTree
{
    private readonly AhoCorasickTreeNode _rootNode;

    /// <summary>
    /// Builds the trie from <paramref name="keywords"/> and wires up the failure links.
    /// </summary>
    /// <param name="keywords">Keywords to look for; must be a non-empty array without null elements.</param>
    /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="keywords"/> is empty or contains a null element.</exception>
    public AhoCorasickTree(string[] keywords)
    {
        if (keywords == null) throw new ArgumentNullException("keywords");
        if (keywords.Length == 0) throw new ArgumentException("should contain keywords");
        _rootNode = new AhoCorasickTreeNode();
        var length = keywords.Length;
        for (var i = 0; i < length; i++)
        {
            // Fail with a descriptive argument error instead of a NullReferenceException deep in the build.
            if (keywords[i] == null) throw new ArgumentException("should not contain null keywords", "keywords");
            AddPatternToTree(keywords[i]);
        }
        SetFailures();
    }

    /// <summary>
    /// Reports whether at least one keyword occurs anywhere in <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>true as soon as a node marked as finished is reached; otherwise false.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public bool Contains(string text)
    {
        if (text == null) throw new ArgumentNullException("text");
        var currentNode = _rootNode;
        var length = text.Length;
        for (var i = 0; i < length; i++)
        {
            var key = text[i];
            var node = currentNode.GetNode(key);
            // Walk the failure chain and retry the character at every fallback state,
            // INCLUDING the root: a keyword may start at this very character.
            while (node == null && currentNode != _rootNode)
            {
                currentNode = currentNode.Failure;
                node = currentNode.GetNode(key);
            }
            if (node == null)
            {
                // No transition even from the root; stay at the root for the next character.
                continue;
            }
            if (node.IsFinished)
            {
                return true;
            }
            currentNode = node;
        }
        return false;
    }

    /// <summary>
    /// Finds every keyword occurrence in <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>
    /// A lazily evaluated sequence of pairs: the matched keyword and the index in
    /// <paramref name="text"/> at which that occurrence starts.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public IEnumerable<KeyValuePair<string, int>> Search(string text)
    {
        // Validate eagerly; an iterator body would postpone the check until enumeration.
        if (text == null) throw new ArgumentNullException("text");
        return SearchIterator(text);
    }

    // Iterator half of Search: scans the text and yields each match as it is found.
    private IEnumerable<KeyValuePair<string, int>> SearchIterator(string text)
    {
        var currentNode = _rootNode;
        var length = text.Length;
        for (var i = 0; i < length; i++)
        {
            var key = text[i];
            var node = currentNode.GetNode(key);
            // Fall back along failure links until some state accepts the character or the root is exhausted.
            while (node == null && currentNode != _rootNode)
            {
                currentNode = currentNode.Failure;
                node = currentNode.GetNode(key);
            }
            if (node == null)
            {
                continue;
            }
            if (node.IsFinished)
            {
                foreach (var result in node.Results)
                {
                    // i is the index of the last matched character, so the match starts result.Length - 1 earlier.
                    yield return new KeyValuePair<string, int>(result, i - result.Length + 1);
                }
            }
            currentNode = node;
        }
    }

    // Inserts one keyword into the trie, creating missing nodes, and marks its last node as a match.
    private void AddPatternToTree(string pattern)
    {
        var latestNode = _rootNode;
        var length = pattern.Length;
        for (var i = 0; i < length; i++)
        {
            latestNode = latestNode.GetNode(pattern[i])
                         ?? latestNode.AddNode(pattern[i]);
        }
        latestNode.IsFinished = true;
        latestNode.Results.Add(pattern);
    }

    // Computes the failure link of every node in breadth-first order and propagates
    // match information (IsFinished / Results) from each failure target to the node.
    private void SetFailures()
    {
        _rootNode.Failure = _rootNode;
        var queue = new Queue<AhoCorasickTreeNode>();
        queue.Enqueue(_rootNode);
        while (queue.Count > 0)
        {
            var currentNode = queue.Dequeue();
            foreach (var node in currentNode.Nodes)
            {
                queue.Enqueue(node);
            }
            if (currentNode == _rootNode)
            {
                continue;
            }
            // Start from the parent's failure target and look for a state that has a child with the same key.
            var failure = currentNode.Parent.Failure;
            var key = currentNode.Key;
            while (failure.GetNode(key) == null && failure != _rootNode)
            {
                failure = failure.Failure;
            }
            failure = failure.GetNode(key);
            // Children of the root would otherwise resolve to themselves; they fall back to the root.
            if (failure == null || failure == currentNode)
            {
                failure = _rootNode;
            }
            currentNode.Failure = failure;
            if (!currentNode.IsFinished)
            {
                currentNode.IsFinished = failure.IsFinished;
            }
            if (currentNode.IsFinished && failure.IsFinished)
            {
                // The failure target was dequeued earlier (it is shallower), so its Results are already complete.
                currentNode.Results.AddRange(failure.Results);
            }
        }
    }

    /// <summary>
    /// Trie node. Children are kept in a small chained hash table
    /// (<c>_buckets</c> / <c>_entries</c>) keyed by character.
    /// </summary>
    private class AhoCorasickTreeNode
    {
        // Node this one was added to; null for the root.
        public readonly AhoCorasickTreeNode Parent;
        // State to fall back to when no child matches the next character.
        public AhoCorasickTreeNode Failure;
        // True when a keyword ends here, either directly or via the failure target.
        public bool IsFinished;
        // Keywords reported when this node is reached during a search.
        public List<string> Results;
        // Character on the edge from Parent to this node.
        public readonly char Key;
        // _buckets[h] is the index in _entries of the first entry of chain h, or -1 when the chain is empty.
        private int[] _buckets;
        // Number of children; both arrays always have exactly this length.
        private int _count;
        private Entry[] _entries;

        /// <summary>Creates a root node (no parent, placeholder key).</summary>
        internal AhoCorasickTreeNode()
            : this(null, ' ')
        {
        }

        private AhoCorasickTreeNode(AhoCorasickTreeNode parent, char key)
        {
            Key = key;
            Parent = parent;
            _buckets = new int[0];
            _entries = new Entry[0];
            Results = new List<string>();
        }

        /// <summary>Snapshot of the child nodes in insertion order.</summary>
        public AhoCorasickTreeNode[] Nodes
        {
            get { return _entries.Select(x => x.Value).ToArray(); }
        }

        /// <summary>
        /// Creates a child for <paramref name="key"/> and stores it in the table.
        /// Does not check whether a child with that key already exists.
        /// </summary>
        /// <param name="key">Character of the new edge.</param>
        /// <returns>The newly created child node.</returns>
        public AhoCorasickTreeNode AddNode(char key)
        {
            var node = new AhoCorasickTreeNode(this, key);
            // Grow the table by exactly one slot; Resize re-buckets the existing entries for the new modulus.
            var newSize = _count + 1;
            Resize(newSize);
            var targetBucket = key % newSize;
            _entries[_count].Key = key;
            _entries[_count].Value = node;
            _entries[_count].Next = _buckets[targetBucket];
            _buckets[targetBucket] = _count;
            _count++;
            return node;
        }

        /// <summary>Looks up the child reached by <paramref name="key"/>.</summary>
        /// <param name="key">Character to follow.</param>
        /// <returns>The child node, or null when there is no such child.</returns>
        public AhoCorasickTreeNode GetNode(char key)
        {
            if (_count == 0) return null;
            var bucketIndex = key % _count;
            for (var i = _buckets[bucketIndex]; i >= 0; i = _entries[i].Next)
            {
                if (_entries[i].Key == key)
                {
                    return _entries[i].Value;
                }
            }
            return null;
        }

        // Reallocates both arrays to newSize and rebuilds the bucket chains of the existing entries.
        private void Resize(int newSize)
        {
            var newBuckets = new int[newSize];
            for (var i = 0; i < newSize; i++)
            {
                newBuckets[i] = -1;
            }
            var newEntries = new Entry[newSize];
            Array.Copy(_entries, 0, newEntries, 0, _entries.Length);
            for (var i = 0; i < _entries.Length; i++)
            {
                var bucket = newEntries[i].Key % newSize;
                newEntries[i].Next = newBuckets[bucket];
                newBuckets[bucket] = i;
            }
            _buckets = newBuckets;
            _entries = newEntries;
        }

        // One slot of the child table: key, child node and index of the next entry in the same chain (-1 = end).
        private struct Entry
        {
            public char Key;
            public int Next;
            public AhoCorasickTreeNode Value;
        }
    }
}
}
使用示例
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using AhoCorasick.Net;
namespace Test
{
/// <summary>
/// Console demo: runs a few Contains checks and one Search against sample keyword sets.
/// </summary>
class Program
{
    /// <summary>
    /// Prints the Contains result for each sample text, then the number of matches
    /// found by Search, and waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        var containsTree = new AhoCorasickTree(new[] { "ab", "abc", "bcd" });
        var samples = new[] { "d", "bce", "bcd", "abcd" };
        foreach (var sample in samples)
        {
            Console.WriteLine(containsTree.Contains(sample));
        }

        var searchTree = new AhoCorasickTree(new[] { "a", "ab", "bab", "bc", "bca", "c", "caa" });
        List<KeyValuePair<string, int>> matches = searchTree.Search("abccab").ToList();
        Console.Write(matches.Count);

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    }
}
/// <summary>
/// Benchmark fixture: a tree built from <c>NumberOfKeywords</c> random keywords of
/// <c>LengthOfKeyword</c> characters each, probed with the first generated keyword.
/// </summary>
public class SmallTreeBenchmark
{
    private const int LengthOfKeyword = 8;
    private const int NumberOfKeywords = 100;
    private readonly string _keyword;
    private readonly AhoCorasickTree _sut;

    /// <summary>
    /// Generates the random keywords, remembers the first one and builds the tree from all of them.
    /// </summary>
    public SmallTreeBenchmark()
    {
        var generator = new RandomString();
        var generated = new string[NumberOfKeywords];
        var index = 0;
        while (index < generated.Length)
        {
            generated[index] = generator.GetRandomString(LengthOfKeyword);
            index++;
        }
        _keyword = generated[0];
        _sut = new AhoCorasickTree(generated);
    }

    /// <summary>
    /// Benchmarked operation: checks the tree against the remembered keyword.
    /// </summary>
    /// <returns>The result of <c>Contains</c> for that keyword.</returns>
    public bool ContainsSmallWord()
    {
        var found = _sut.Contains(_keyword);
        return found;
    }
}
/// <summary>
/// Produces random strings made of ASCII letters (upper and lower case).
/// </summary>
public class RandomString
{
    private const string Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    private readonly Random _random = new Random();

    /// <summary>
    /// Builds a string of <paramref name="length"/> characters, each drawn from <c>Chars</c>.
    /// </summary>
    /// <param name="length">Number of characters to generate.</param>
    /// <returns>The generated string.</returns>
    public string GetRandomString(int length)
    {
        var buffer = new char[length];
        var position = 0;
        while (position < buffer.Length)
        {
            var pick = _random.Next(Chars.Length);
            buffer[position] = Chars[pick];
            position++;
        }
        return new string(buffer);
    }
}
}
如图: