一个字符串搜索的Aho-Corasick算法

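Aho和Corasick对KMP算法(Knuth–Morris–Pratt algorithm)进行了改进,使其可以同时匹配多个关键字:Aho-Corasick算法(Aho-Corasick algorithm)先用所有关键字构建一棵关键字树(Trie),并为每个节点计算失败(fail)函数,之后只需扫描一遍文本即可找出全部匹配,搜索的时间复杂度为O(n+z)(n为文本长度,z为匹配结果数),构建树的开销与关键字总长度成正比。原理图如下(摘自Aho-Corasick string matching in C#):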

Building of the keyword tree (figure 1 - after the first step, figure 2 - tree with the fail function)

C#版本的实现代码可以从Aho-Corasick string matching in C#得到,也可以点击这里获得该算法的PDF文档。

这是一个应用示例:

预览图

它能将载入的RTF文档中的搜索关键字高亮,检索速度较快,示例没有实现全字匹配,算法代码简要如下:

/* Aho-Corasick text search algorithm implementation
 *
 * For more information visit
 *   - http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
 */
using System;
using System.Collections;
using System.Collections.Generic;

namespace EeekSoft.Text
{
    /// <summary>
    /// Interface containing all methods to be implemented
    /// by a string search algorithm.
    /// </summary>
    public interface IStringSearchAlgorithm
    {
        #region Methods & Properties

        /// <summary>
        /// Ignore case of letters.
        /// </summary>
        bool IgnoreCase { get; set; }

        /// <summary>
        /// List of keywords to search for.
        /// </summary>
        string[] Keywords { get; set; }

        /// <summary>
        /// Searches passed text and returns all occurrences of any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>Array of occurrences</returns>
        StringSearchResult[] FindAll(string text);

        /// <summary>
        /// Searches passed text and returns first occurrence of any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
        StringSearchResult FindFirst(string text);

        /// <summary>
        /// Searches passed text and returns true if text contains any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>True when text contains any keyword</returns>
        bool ContainsAny(string text);

        #endregion
    }

    /// <summary>
    /// Structure containing results of search
    /// (keyword and position in original text).
    /// </summary>
    public struct StringSearchResult
    {
        #region Members

        private readonly int _index;
        private readonly string _keyword;

        /// <summary>
        /// Initialize string search result.
        /// </summary>
        /// <param name="index">Index in text</param>
        /// <param name="keyword">Found keyword</param>
        public StringSearchResult(int index, string keyword)
        {
            _index = index;
            _keyword = keyword;
        }

        /// <summary>
        /// Returns index of found keyword in original text.
        /// </summary>
        public int Index
        {
            get { return _index; }
        }

        /// <summary>
        /// Returns keyword found by this result.
        /// </summary>
        public string Keyword
        {
            get { return _keyword; }
        }

        /// <summary>
        /// Returns empty search result (index -1, empty keyword).
        /// </summary>
        public static StringSearchResult Empty
        {
            get { return new StringSearchResult(-1, ""); }
        }

        #endregion
    }

    /// <summary>
    /// Class for searching a string for one or multiple
    /// keywords using the Aho-Corasick search algorithm.
    /// </summary>
    public class StringSearch : IStringSearchAlgorithm
    {
        #region Objects

        /// <summary>
        /// Tree node representing a character together with its
        /// transition and failure function.
        /// </summary>
        private sealed class TreeNode
        {
            #region Fields

            private readonly char _char;
            private readonly TreeNode _parent;
            private TreeNode _failure;

            // Patterns ending in this node, plus a cached array snapshot so
            // that searches can enumerate results without copying.
            private readonly List<string> _results = new List<string>();
            private string[] _resultsAr = new string[0];

            // Child nodes keyed by (optionally case-folded) character, plus a
            // cached array snapshot used while building the tree.
            private readonly Dictionary<char, TreeNode> _transitions = new Dictionary<char, TreeNode>();
            private TreeNode[] _transitionsAr = new TreeNode[0];

            #endregion

            #region Constructor & Methods

            /// <summary>
            /// Initialize tree node with specified character.
            /// </summary>
            /// <param name="parent">Parent node (null for the root)</param>
            /// <param name="c">Character</param>
            public TreeNode(TreeNode parent, char c)
            {
                _char = c;
                _parent = parent;
            }

            /// <summary>
            /// Adds a pattern ending in this node (duplicates are ignored).
            /// </summary>
            /// <param name="result">Pattern</param>
            public void AddResult(string result)
            {
                if (_results.Contains(result))
                {
                    return;
                }

                _results.Add(result);
                _resultsAr = _results.ToArray();
            }

            /// <summary>
            /// Adds a transition (child) node.
            /// </summary>
            /// <param name="node">Node</param>
            /// <param name="ignoreCase">Ignore case of letters</param>
            public void AddTransition(TreeNode node, bool ignoreCase)
            {
                _transitions.Add(Normalize(node.Char, ignoreCase), node);

                TreeNode[] snapshot = new TreeNode[_transitions.Count];
                _transitions.Values.CopyTo(snapshot, 0);
                _transitionsAr = snapshot;
            }

            /// <summary>
            /// Returns transition to specified character (if it exists).
            /// </summary>
            /// <param name="c">Character</param>
            /// <param name="ignoreCase">Ignore case of letters</param>
            /// <returns>Returns TreeNode or null</returns>
            public TreeNode GetTransition(char c, bool ignoreCase)
            {
                TreeNode node;
                return _transitions.TryGetValue(Normalize(c, ignoreCase), out node) ? node : null;
            }

            /// <summary>
            /// Returns true if node contains transition to specified character.
            /// </summary>
            /// <param name="c">Character</param>
            /// <param name="ignoreCase">Ignore case of letters</param>
            /// <returns>True if transition exists</returns>
            public bool ContainsTransition(char c, bool ignoreCase)
            {
                return GetTransition(c, ignoreCase) != null;
            }

            /// <summary>
            /// Maps a character to the key used in the transition table.
            /// The invariant culture is used so that keys do not depend on
            /// the current thread culture.
            /// </summary>
            private static char Normalize(char c, bool ignoreCase)
            {
                return ignoreCase ? char.ToLowerInvariant(c) : c;
            }

            #endregion

            #region Properties

            /// <summary>
            /// Character.
            /// </summary>
            public char Char
            {
                get { return _char; }
            }

            /// <summary>
            /// Parent tree node.
            /// </summary>
            public TreeNode Parent
            {
                get { return _parent; }
            }

            /// <summary>
            /// Failure function - node to continue from when no transition matches.
            /// </summary>
            public TreeNode Failure
            {
                get { return _failure; }
                set { _failure = value; }
            }

            /// <summary>
            /// Transition function - list of child nodes.
            /// </summary>
            public TreeNode[] Transitions
            {
                get { return _transitionsAr; }
            }

            /// <summary>
            /// Returns list of patterns ending by this letter.
            /// </summary>
            public string[] Results
            {
                get { return _resultsAr; }
            }

            #endregion
        }

        #endregion

        #region Local fields

        /// <summary>
        /// Root of keyword tree (never null once a constructor has finished).
        /// </summary>
        private TreeNode _root;

        /// <summary>
        /// Keywords to search for.
        /// </summary>
        private string[] _keywords;

        /// <summary>
        /// Whether letter case is ignored; the tree is keyed accordingly.
        /// </summary>
        private bool _ignoreCase;

        #endregion

        #region Initialization

        /// <summary>
        /// Initialize search algorithm (Build keyword tree).
        /// </summary>
        /// <param name="keywords">Keywords to search for</param>
        /// <param name="ignoreCase">Ignore case of letters (the default is false)</param>
        public StringSearch(string[] keywords, bool ignoreCase)
        {
            // The flag must be in place before the tree is built, otherwise
            // transitions are keyed with the wrong case.
            _ignoreCase = ignoreCase;
            Keywords = keywords;
        }

        /// <summary>
        /// Initialize search algorithm (Build keyword tree).
        /// </summary>
        /// <param name="keywords">Keywords to search for</param>
        public StringSearch(string[] keywords)
        {
            Keywords = keywords;
        }

        /// <summary>
        /// Initialize search algorithm with no keywords
        /// (Use Keywords property).
        /// </summary>
        public StringSearch()
        {
            // Build an empty tree so that searching before any keywords are
            // assigned simply finds nothing.
            BuildTree();
        }

        #endregion

        #region Implementation

        /// <summary>
        /// Build tree from the current keywords and case setting.
        /// Null or empty keywords are skipped.
        /// </summary>
        private void BuildTree()
        {
            // Build keyword tree and transition function
            TreeNode root = new TreeNode(null, ' ');

            if (_keywords != null)
            {
                foreach (string pattern in _keywords)
                {
                    if (string.IsNullOrEmpty(pattern))
                    {
                        continue;
                    }

                    // add pattern to tree
                    TreeNode node = root;
                    foreach (char c in pattern)
                    {
                        TreeNode next = node.GetTransition(c, _ignoreCase);
                        if (next == null)
                        {
                            next = new TreeNode(node, c);
                            node.AddTransition(next, _ignoreCase);
                        }

                        node = next;
                    }

                    node.AddResult(pattern);
                }
            }

            // Find failure functions
            List<TreeNode> level = new List<TreeNode>();

            // level 1 nodes - fail to root node
            foreach (TreeNode node in root.Transitions)
            {
                node.Failure = root;
                level.AddRange(node.Transitions);
            }

            // other nodes - using BFS, one tree level at a time
            while (level.Count != 0)
            {
                List<TreeNode> nextLevel = new List<TreeNode>();

                foreach (TreeNode node in level)
                {
                    TreeNode r = node.Parent.Failure;
                    char c = node.Char;

                    // root.Failure is still null here, which terminates the walk
                    while (r != null && !r.ContainsTransition(c, _ignoreCase))
                    {
                        r = r.Failure;
                    }

                    if (r == null)
                    {
                        node.Failure = root;
                    }
                    else
                    {
                        node.Failure = r.GetTransition(c, _ignoreCase);

                        // patterns ending at the failure node also end here
                        foreach (string result in node.Failure.Results)
                        {
                            node.AddResult(result);
                        }
                    }

                    // add child nodes to BFS list
                    nextLevel.AddRange(node.Transitions);
                }

                level = nextLevel;
            }

            root.Failure = root;
            _root = root;
        }

        /// <summary>
        /// Advances the search state by one input character: follows failure
        /// links until a transition for the character exists or the root is reached.
        /// </summary>
        /// <param name="current">Current state</param>
        /// <param name="c">Next character of the searched text</param>
        /// <returns>State after consuming the character</returns>
        private TreeNode Step(TreeNode current, char c)
        {
            TreeNode trans = null;
            while (trans == null)
            {
                trans = current.GetTransition(c, _ignoreCase);
                if (current == _root)
                {
                    break;
                }

                if (trans == null)
                {
                    current = current.Failure;
                }
            }

            // When no transition was found, current is the root at this point.
            return trans != null ? trans : current;
        }

        #endregion

        #region Methods & Properties

        /// <summary>
        /// Ignore case of letters (changing this value rebuilds the keyword tree).
        /// </summary>
        public bool IgnoreCase
        {
            get { return _ignoreCase; }
            set
            {
                if (_ignoreCase == value)
                {
                    return;
                }

                _ignoreCase = value;
                BuildTree();
            }
        }

        /// <summary>
        /// Keywords to search for (setting this property is slow, because
        /// it requires rebuilding of keyword tree). A null array is treated
        /// as an empty keyword list.
        /// </summary>
        public string[] Keywords
        {
            get { return _keywords; }
            set
            {
                _keywords = value;
                BuildTree();
            }
        }

        /// <summary>
        /// Searches passed text and returns all occurrences of any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>Array of occurrences</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
        public StringSearchResult[] FindAll(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            List<StringSearchResult> ret = new List<StringSearchResult>();
            TreeNode ptr = _root;

            for (int index = 0; index < text.Length; index++)
            {
                ptr = Step(ptr, text[index]);

                foreach (string found in ptr.Results)
                {
                    // index is the position of the keyword's last character
                    ret.Add(new StringSearchResult(index - found.Length + 1, found));
                }
            }

            return ret.ToArray();
        }

        /// <summary>
        /// Searches passed text and returns first occurrence of any keyword
        /// (the match whose end is reached first while scanning the text).
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
        public StringSearchResult FindFirst(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            TreeNode ptr = _root;

            for (int index = 0; index < text.Length; index++)
            {
                ptr = Step(ptr, text[index]);

                string[] results = ptr.Results;
                if (results.Length > 0)
                {
                    string found = results[0];
                    return new StringSearchResult(index - found.Length + 1, found);
                }
            }

            return StringSearchResult.Empty;
        }

        /// <summary>
        /// Searches passed text and returns true if text contains any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>True when text contains any keyword</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
        public bool ContainsAny(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            TreeNode ptr = _root;

            for (int index = 0; index < text.Length; index++)
            {
                ptr = Step(ptr, text[index]);

                if (ptr.Results.Length > 0)
                {
                    return true;
                }
            }

            return false;
        }

        #endregion
    }
}

示例下载页面:http://www.uushare.com/user/m2nlight/file/2722093

StringSearch.7z
StringSearch.7z
类型:7Z 压缩文件
大小:32.5 KB
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值