后缀树(Suffix Tree)的文本匹配算法

最新推荐文章于 2024-07-15 09:03:01 发布

ljsspace

最新推荐文章于 2024-07-15 09:03:01 发布

阅读量6.8k

点赞数

分类专栏：数据结构和算法 Stringology 文章标签：算法 tree string insert exception terminal

本文链接：https://blog.csdn.net/ljsspace/article/details/6571467

版权

数据结构和算法同时被 2 个专栏收录

41 篇文章 0 订阅

订阅专栏

Stringology

18 篇文章 0 订阅

订阅专栏

后缀树(Suffix Tree)是一种特殊的Trie，它的用途非常广泛，其中一个主要的应用是做文本匹配。像KMP等算法一样，它也是空间换时间的一个典范。利用Suffix Tree做文本匹配与其他的模式匹配算法比如KMP和Boyer-Moore算法的主要区别是，后缀树文本匹配算法是对文本T做预处理，而KMP算法是对模式串P做预处理。因此后缀树常用于文本静态，而模式串动态的场合；而KMP等算法常用于文本动态，模式串静态的场合。设T的长度为n，P的长度为m，一般情况下m<n。在预处理阶段，后缀树的构建复杂度为O(n)（需采用Ukkonen等线性时间构造算法；本文使用的蛮力构造法为O(n^2)），而KMP和Boyer-Moore的预处理复杂度为O(m)。可是预处理结束后，KMP等算法的匹配复杂度为O(n)，后缀树匹配算法的复杂度只有O(m)，这是令人惊叹的效率！

本文后缀树用蛮力法构建，跟构建前缀树Patricia Trie类似。后缀树用Patricia Trie压缩存储的好处是，Patricia Trie存储空间只与单词的个数相关(因为有了压缩)，而普通的Trie的存储空间与单词的总长度相关(因为没有压缩)。一个文本text的所有后缀总长度为n + (n-1) + ... + 1 = n(n+1)/2，如果用普通的Trie存储后缀树，所需空间为O(n^2)；而用Patricia Trie压缩之后的为O(n)，这里n为后缀的个数。没有使用压缩存储的后缀树叫做Suffix Trie，而不是Suffix Tree。一般情况下，使用压缩方式存储后缀树是最基本的要求。

在下面的实现中，利用Patricia Trie来构造后缀树，每一个结点除了存储Patricia Trie的key值之外，还存储了该结点key值在文本text中出现的最小下标值minStartIndex，这样便于匹配时输出成功匹配的位置。另外，出于实际应用考虑，后缀树在叶子结点中不必要存储value。除了没有delete操作（文本是静态的，不需要修改）之外，建树操作(insert) 和查询匹配(find)操作跟Patricia Trie的实现差别不大。

实现：

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

/**
 * Suffix-tree string pattern matching (tree built by brute force).
 *
 * <p>The tree is a compressed trie (Patricia trie) over the inserted suffixes.
 * Every node stores its edge label ({@code key}) and the smallest text index at
 * which that label starts among all suffixes that pass through the node
 * ({@code minStartIndex}); this lets {@link #find(String)} report the position
 * of the first occurrence of a pattern.
 *
 * <p>Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
 *
 * @author ljs
 * 2011-06-27
 */
public class SuffixTree {

    /** A node of the compressed trie; the edge from its parent is labelled {@code key}. */
    private static final class SuffixNode {
        private String key;
        // Children are kept sorted by the first character of their key.
        private List<SuffixNode> children = new ArrayList<SuffixNode>();
        // True when an inserted suffix ends exactly at this node (printed as "#").
        private boolean terminal;
        // Smallest text index at which this node's key starts.
        private int minStartIndex;

        /** Creates the root node: empty key, no start index. */
        SuffixNode() {
            this.key = "";
            this.minStartIndex = -1;
        }

        SuffixNode(String key) {
            this.key = key;
        }

        @Override
        public String toString() {
            return this.key + "[" + this.minStartIndex + "]" + (this.terminal ? "#" : "")
                    + "(" + children.size() + ")";
        }
    }

    private SuffixNode root;
    private final String text;

    /**
     * Creates an empty tree for {@code text}; call {@link #buildSuffixTree()} to populate it.
     *
     * @param text the text whose suffixes will be indexed
     */
    public SuffixTree(String text) {
        this.text = text;
    }

    /**
     * Returns how many leading characters of {@code key} equal the characters of
     * {@code s} starting at index {@code from}.
     */
    private static int commonPrefixLength(String key, String s, int from) {
        int limit = Math.min(key.length(), s.length() - from);
        int k = 0;
        while (k < limit && key.charAt(k) == s.charAt(from + k)) {
            k++;
        }
        return k;
    }

    /** Creates a terminal leaf for {@code key} whose label starts at text index {@code startIndex}. */
    private static SuffixNode newLeaf(String key, int startIndex) {
        SuffixNode leaf = new SuffixNode(key);
        leaf.terminal = true;
        leaf.minStartIndex = startIndex;
        return leaf;
    }

    /**
     * Splits the edge of {@code node} after {@code at} characters. The remainder of the key,
     * the terminal flag and the children move to a new single child; {@code node} keeps the
     * first {@code at} characters and becomes non-terminal.
     */
    private static void splitEdge(SuffixNode node, int at) {
        SuffixNode tail = new SuffixNode(node.key.substring(at));
        tail.terminal = node.terminal;
        tail.children = node.children;
        // Exactly the suffixes that passed through node pass through tail,
        // so the tail label's earliest start is shifted by the split point.
        tail.minStartIndex = node.minStartIndex + at;

        node.key = node.key.substring(0, at);
        node.terminal = false;
        node.children = new ArrayList<SuffixNode>();
        node.children.add(tail);
    }

    /**
     * Finds the first occurrence of {@code pattern} among the inserted suffixes.
     *
     * @param pattern the pattern to look up
     * @return the smallest start index of a match, or -1 if there is no match, the pattern
     *         is null or empty, or the tree has not been built
     */
    public int find(String pattern) {
        if (pattern == null || pattern.length() == 0 || root == null) {
            return -1;
        }
        SuffixNode node = root;
        int matched = 0; // pattern characters consumed by the edges above node
        while (true) {
            SuffixNode next = null;
            for (SuffixNode child : node.children) {
                int common = commonPrefixLength(child.key, pattern, matched);
                if (common == 0) {
                    // Children are sorted: once a child starts with a larger
                    // character, no later sibling can match either.
                    if (pattern.charAt(matched) < child.key.charAt(0)) {
                        return -1;
                    }
                    continue;
                }
                if (common == pattern.length() - matched) {
                    // Pattern ends on this edge (at its end or in the middle).
                    return child.minStartIndex - matched;
                }
                if (common < child.key.length()) {
                    // Mismatch in the middle of the edge.
                    return -1;
                }
                // Whole edge matched and pattern continues: descend.
                next = child;
                break;
            }
            if (next == null) {
                return -1;
            }
            matched += next.key.length();
            node = next;
        }
    }

    /**
     * Inserts one suffix into the tree.
     *
     * @param suffix the suffix to insert; null or empty input is ignored
     * @param startIndex index in the text at which {@code suffix} starts
     * @throws IllegalArgumentException if the same suffix has already been inserted
     */
    public void insert(String suffix, int startIndex) {
        if (suffix == null || suffix.length() == 0) {
            return;
        }
        if (root == null) {
            root = new SuffixNode();
        }
        SuffixNode node = root;
        int offset = 0; // suffix characters consumed by the edges above node
        while (true) {
            int remaining = suffix.length() - offset;
            int position = startIndex + offset; // text index of suffix.charAt(offset)

            // Locate either a child sharing a prefix or the sorted insert position.
            SuffixNode shared = null;
            int common = 0;
            int insertAt = node.children.size();
            for (int i = 0; i < node.children.size(); i++) {
                SuffixNode child = node.children.get(i);
                int c = commonPrefixLength(child.key, suffix, offset);
                if (c > 0) {
                    shared = child;
                    common = c;
                    break;
                }
                if (suffix.charAt(offset) < child.key.charAt(0)) {
                    insertAt = i;
                    break;
                }
            }

            if (shared == null) {
                // No child shares a first character: add a new leaf in sorted position.
                node.children.add(insertAt, newLeaf(suffix.substring(offset), position));
                return;
            }

            if (common < shared.key.length()) {
                // Only part of the edge matches: split it at the divergence point.
                splitEdge(shared, common);
                if (common == remaining) {
                    // The suffix ends exactly at the split point.
                    shared.terminal = true;
                } else {
                    SuffixNode tail = shared.children.get(0);
                    SuffixNode leaf = newLeaf(suffix.substring(offset + common), position + common);
                    // First characters differ here, so the comparison is strict.
                    int at = leaf.key.charAt(0) < tail.key.charAt(0) ? 0 : 1;
                    shared.children.add(at, leaf);
                }
                shared.minStartIndex = Math.min(shared.minStartIndex, position);
                return;
            }

            // The whole edge matched.
            if (common == remaining) {
                if (shared.terminal) {
                    throw new IllegalArgumentException("Duplicate Key is found when insertion!");
                }
                shared.terminal = true;
                shared.minStartIndex = Math.min(shared.minStartIndex, position);
                return;
            }

            // Suffix continues below this child: record the start and descend.
            shared.minStartIndex = Math.min(shared.minStartIndex, position);
            node = shared;
            offset += common;
        }
    }

    /**
     * Builds the tree by inserting every suffix of the text, longest first.
     * Does nothing when the text is null or empty.
     *
     * @throws IllegalArgumentException if a suffix is already present, e.g. when the
     *         tree is built twice
     */
    public void buildSuffixTree() {
        if (text == null) {
            return;
        }
        for (int i = 0; i < text.length(); i++) {
            this.insert(text.substring(i), i);
        }
    }

    /** Prints the tree to standard output (for test purposes only); prints nothing if it is empty. */
    public void printTree() {
        if (this.root == null) {
            return;
        }
        this.print(0, this.root);
    }

    /** Prints {@code node} and its subtree, one node per line, indented by {@code level}. */
    private void print(int level, SuffixNode node) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < level; i++) {
            line.append(' ');
        }
        line.append('|');
        for (int i = 0; i < level; i++) {
            line.append('-');
        }
        line.append(node.key).append('[').append(node.minStartIndex).append(']');
        if (node.terminal) {
            line.append('#');
        }
        System.out.println(line);
        for (SuffixNode child : node.children) {
            print(level + 1, child);
        }
    }

    /** Looks up {@code pattern} and prints the result to standard output. */
    public void testFind(String pattern) {
        int index = this.find(pattern);
        if (index != -1) {
            System.out.format("Found pattern \"%s\" at: %s%n", pattern, index);
        } else {
            System.out.format("Found no such pattern: \"%s\"%n", pattern);
        }
    }

    public static void main(String[] args) {
        // test suffix-tree
        System.out.println("****************************");
        String text = "minimize";
        SuffixTree strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.printTree();

        System.out.println("****************************");
        text = "mississippi";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.printTree();

        String pattern = "iss";
        strie.testFind(pattern);
        pattern = "ip";
        strie.testFind(pattern);
        pattern = "pi";
        strie.testFind(pattern);
        pattern = "miss";
        strie.testFind(pattern);
        pattern = "tt";
        strie.testFind(pattern);
        pattern = "si";
        strie.testFind(pattern);
        pattern = "ssi";
        strie.testFind(pattern);
        pattern = "sissippi";
        strie.testFind(pattern);
        pattern = "ssippi";
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "After a long text, here's a needle ZZZZZ";
        pattern = "ZZZZZ";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "The quick brown fox jumps over the lazy dog.";
        pattern = "lazy";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...";
        pattern = "tempor";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA";
        pattern = "GCAGAGAG";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);
    }
}

测试输出：

****************************
|[-1]
 |-e[7]#
 |-i[1]
  |--mize[4]#
  |--nimize[2]#
  |--ze[6]#
 |-mi[0]
  |--nimize[2]#
  |--ze[6]#
 |-nimize[2]#
 |-ze[6]#
****************************
|[-1]
 |-i[1]#
  |--ppi[8]#
  |--ssi[2]
   |---ppi[8]#
   |---ssippi[5]#
 |-mississippi[0]#
 |-p[8]
  |--i[10]#
  |--pi[9]#
 |-s[2]
  |--i[4]
   |---ppi[8]#
   |---ssippi[5]#
  |--si[3]
   |---ppi[8]#
   |---ssippi[5]#
Found pattern "iss" at: 1
Found pattern "ip" at: 7
Found pattern "pi" at: 9
Found pattern "miss" at: 0
Found no such pattern: "tt"
Found pattern "si" at: 3
Found pattern "ssi" at: 2
Found pattern "sissippi" at: 3
Found pattern "ssippi" at: 5
****************************
Found pattern "ZZZZZ" at: 35
****************************
Found pattern "lazy" at: 35
****************************
Found pattern "tempor" at: 73
****************************
Found pattern "GCAGAGAG" at: 23