后缀树(Suffix Tree)的文本匹配算法 后缀树(Suffix Tree)是一种特殊的Trie,用途非常广泛,其中一个主要应用是文本匹配。与KMP等算法一样,它也是以空间换时间的典范。利用Suffix Tree做文本匹配与KMP、Boyer-Moore等其他模式匹配算法的主要区别是:后缀树文本匹配算法对文本T做预处理,而KMP等算法对模式串P做预处理。因此后缀树常用于文本静态、模式串动态的场合,而KMP等算法常用于文本动态、模式串静态的场合。设T的长度为n,P的长度为m,一般情况下m<n。在预处理阶段,后缀树的构建复杂度为O(n)(需采用Ukkonen等线性时间构造算法),而KMP和Boyer-Moore的预处理复杂度为O(m)。预处理结束后,KMP等算法的匹配复杂度为O(n),而后缀树匹配算法的复杂度只有O(m),这是令人惊叹的效率!
本文的后缀树用蛮力法构建(逐个插入所有后缀,构建的时间复杂度为O(n^2)),过程与构建前缀树Patricia Trie类似。后缀树用Patricia Trie压缩存储的好处是,Patricia Trie的存储空间只与单词的个数相关(因为有了压缩),而普通Trie的存储空间与单词的总长度相关(因为没有压缩)。一个文本text的所有后缀总长度为n + (n-1) + ... + 1 = n(n+1)/2,如果用普通的Trie存储后缀树,所需空间为O(n^2);而用Patricia Trie压缩之后结点个数为O(n)(当边上的标签以文本下标区间而非子串副本表示时,总空间也为O(n)),这里n为后缀的个数。没有使用压缩存储的后缀树叫做Suffix Trie,而不是Suffix Tree。一般情况下,使用压缩方式存储后缀树是最基本的要求。
在下面的实现中,利用Patricia Trie来构造后缀树,每一个结点除了存储Patricia Trie的key值之外,还存储了该结点key值在文本text中出现的最小下标值minStartIndex,这样便于匹配时输出成功匹配的位置。另外,出于实际应用考虑,后缀树的叶子结点不必存储value。除了没有delete操作(文本是静态的,不需要修改)之外,建树操作(insert)和查询匹配操作(find)跟Patricia Trie的实现差别不大。
实现:
import java.util.LinkedList;
import java.util.List;
/**
*
* Suffix-Tree String Pattern Matching(Building tree using brute-force)
*
* Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
* Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
*
* @author ljs
* 2011-06-27
*
*/
public class SuffixTree {

    // Implementation notes:
    //  * The tree is a compressed trie (Patricia trie) holding suffixes of a text.
    //    Every edge label is kept as a String in the child node ("key").
    //  * Siblings are kept sorted by the first character of their key; lookups
    //    rely on that order to stop early.
    //  * Each node remembers minStartIndex, the smallest text index at which its
    //    key has been seen on this path. find() turns that into the start index
    //    of the whole pattern by subtracting the number of pattern characters
    //    consumed by the ancestor edges.
    //  * The tree is built by brute force: every suffix is inserted one by one.

    /**
     * A node of the compressed trie.
     *
     * Declared static because it never touches the enclosing SuffixTree
     * instance; a non-static inner class would carry a hidden reference to it.
     */
    private static final class SuffixNode {
        /** Edge label leading into this node; empty only for the root. */
        private String key;
        /** Child nodes, sorted by the first character of their key. */
        private List<SuffixNode> children = new LinkedList<SuffixNode>();
        /** True when the path ending at this node spells a complete inserted suffix (shown as "#"). */
        private boolean terminal;
        /** Smallest text index at which this node's key starts on this path. */
        private int minStartIndex;

        /** Creates the root node: empty key, no start index (-1). */
        public SuffixNode() {
            this.key = "";
            this.minStartIndex = -1;
        }

        /** Creates a node for the given edge label; the caller sets the remaining fields. */
        public SuffixNode(String key) {
            this.key = key;
        }

        @Override
        public String toString() {
            return this.key + "[" + this.minStartIndex + "]" + (this.terminal ? "#" : "")
                    + "(" + children.size() + ")";
        }
    }

    private SuffixNode root;
    private String text;
    /** Set once buildSuffixTree() has inserted every suffix of text. */
    private boolean built;

    /**
     * Creates an (unbuilt) suffix tree for the given text.
     * Call {@link #buildSuffixTree()} before searching.
     *
     * @param text the text whose suffixes will be indexed
     */
    public SuffixTree(String text) {
        this.text = text;
    }

    /**
     * Searches the tree for a pattern.
     *
     * @param pattern the substring to look for
     * @return the start index of the matched substring, or -1 if the pattern is
     *         null, empty, not present, or the tree has not been populated yet
     */
    public int find(String pattern) {
        if (pattern == null || pattern.length() == 0 || root == null) {
            return -1;
        }
        SuffixNode node = root;
        // Number of leading pattern characters already matched by ancestor edges.
        int consumed = 0;
        while (true) {
            SuffixNode edge = childStartingWith(node, pattern.charAt(consumed));
            if (edge == null) {
                return -1;
            }
            int common = commonPrefixLength(pattern, consumed, edge.key);
            int remaining = pattern.length() - consumed;
            if (common == remaining) {
                // The rest of the pattern lies entirely on this edge. The edge
                // starts 'consumed' characters after the pattern does.
                return edge.minStartIndex - consumed;
            }
            if (common < edge.key.length()) {
                // Mismatch in the middle of the edge label.
                return -1;
            }
            // Whole edge matched and pattern characters remain: descend.
            consumed += common;
            node = edge;
        }
    }

    /**
     * Returns the child of {@code parent} whose key starts with {@code lead},
     * or null if there is none. Stops as soon as a sibling with a larger first
     * character is met, because siblings are sorted.
     */
    private static SuffixNode childStartingWith(SuffixNode parent, char lead) {
        for (SuffixNode candidate : parent.children) {
            char first = candidate.key.charAt(0);
            if (first == lead) {
                return candidate;
            }
            if (lead < first) {
                return null;
            }
        }
        return null;
    }

    /** Length of the longest common prefix of {@code s.substring(from)} and {@code label}. */
    private static int commonPrefixLength(String s, int from, String label) {
        int limit = Math.min(s.length() - from, label.length());
        int k = 0;
        while (k < limit && s.charAt(from + k) == label.charAt(k)) {
            k++;
        }
        return k;
    }

    /** Creates a terminal node with the given key and start index. */
    private static SuffixNode newLeaf(String key, int startIndex) {
        SuffixNode leaf = new SuffixNode(key);
        leaf.terminal = true;
        leaf.minStartIndex = startIndex;
        return leaf;
    }

    /** Lowers {@code node.minStartIndex} to {@code candidate} if that is smaller. */
    private static void lowerMinStartIndex(SuffixNode node, int candidate) {
        if (node.minStartIndex > candidate) {
            node.minStartIndex = candidate;
        }
    }

    /**
     * Inserts one suffix into the tree. A null or empty suffix is ignored.
     *
     * @param suffix     the suffix to insert
     * @param startIndex index in the text at which {@code suffix} starts
     * @throws IllegalArgumentException if the same suffix was already inserted
     * @throws Exception declared for source compatibility with existing callers
     */
    public void insert(String suffix, int startIndex) throws Exception {
        if (suffix == null || suffix.length() == 0) {
            return;
        }
        if (root == null) {
            root = new SuffixNode();
        }
        SuffixNode node = root;
        // suffix[from..] is the part that still has to be placed below 'node'.
        int from = 0;
        while (true) {
            char lead = suffix.charAt(from);
            int keyStart = startIndex + from;

            // Locate the child sharing the first character, or the sorted
            // position at which a new child has to be inserted.
            int position = 0;
            SuffixNode child = null;
            for (SuffixNode candidate : node.children) {
                char first = candidate.key.charAt(0);
                if (first == lead) {
                    child = candidate;
                    break;
                }
                if (lead < first) {
                    break;
                }
                position++;
            }

            if (child == null) {
                // e.g. children e, f; inserting "c" gives c#, e, f
                node.children.add(position, newLeaf(suffix.substring(from), keyStart));
                return;
            }

            int common = commonPrefixLength(suffix, from, child.key);
            int remaining = suffix.length() - from;

            if (common == child.key.length()) {
                if (common == remaining) {
                    // Key equals the child's label exactly.
                    if (child.terminal) {
                        throw new IllegalArgumentException("Duplicate Key is found when insertion!");
                    }
                    child.terminal = true;
                    lowerMinStartIndex(child, keyStart);
                    return;
                }
                // Child's label is a proper prefix of the key: continue below it.
                lowerMinStartIndex(child, keyStart);
                node = child;
                from += common;
                continue;
            }

            // The key diverges from (or ends inside) the child's label: split
            // the child at 'common'. The tail inherits the child's subtree and
            // terminal flag; its start index is computed before the child's own
            // index is lowered.
            SuffixNode tail = new SuffixNode(child.key.substring(common));
            tail.terminal = child.terminal;
            tail.children = child.children;
            tail.minStartIndex = child.minStartIndex + common;

            child.key = child.key.substring(0, common);
            child.children = new LinkedList<SuffixNode>();
            lowerMinStartIndex(child, keyStart);

            if (common == remaining) {
                // e.g. child "abc#", inserting "ab" gives ab# -> c#
                child.terminal = true;
                child.children.add(tail);
            } else {
                // e.g. child "abc#", inserting "abd" gives ab -> (c#, d#)
                child.terminal = false;
                SuffixNode leaf = newLeaf(suffix.substring(from + common), keyStart + common);
                if (leaf.key.charAt(0) < tail.key.charAt(0)) {
                    child.children.add(leaf);
                    child.children.add(tail);
                } else {
                    child.children.add(tail);
                    child.children.add(leaf);
                }
            }
            return;
        }
    }

    /**
     * Builds the suffix tree by inserting every suffix of the text, longest
     * first. Calling it again after a successful build is a no-op (previously a
     * second call failed with a duplicate-key exception).
     *
     * @throws NullPointerException if the text passed to the constructor was null
     * @throws Exception propagated from {@link #insert(String, int)}
     */
    public void buildSuffixTree() throws Exception {
        if (built) {
            return;
        }
        for (int i = 0; i < text.length(); i++) {
            this.insert(text.substring(i), i);
        }
        built = true;
    }

    /** Prints the tree to standard output; prints nothing if the tree is empty. For test purposes only. */
    public void printTree() {
        if (this.root == null) {
            return;
        }
        this.print(0, this.root);
    }

    /** Prints {@code node} at the given depth followed by its subtree, one node per line. */
    private void print(int level, SuffixNode node) {
        for (int i = 0; i < level; i++) {
            System.out.format(" ");
        }
        System.out.format("|");
        for (int i = 0; i < level; i++) {
            System.out.format("-");
        }
        if (node.terminal) {
            System.out.format("%s[%s]#%n", node.key, node.minStartIndex);
        } else {
            System.out.format("%s[%s]%n", node.key, node.minStartIndex);
        }
        for (SuffixNode child : node.children) {
            print(level + 1, child);
        }
    }

    /** Runs {@link #find(String)} and reports the outcome on standard output. */
    public void testFind(String pattern) {
        int index = this.find(pattern);
        if (index != -1) {
            System.out.format("Found pattern \"%s\" at: %s%n", pattern, index);
        } else {
            System.out.format("Found no such pattern: \"%s\"%n", pattern);
        }
    }

    public static void main(String[] args) throws Exception {
        //test suffix-tree
        System.out.println("****************************");
        String text = "minimize";
        SuffixTree strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.printTree();

        System.out.println("****************************");
        text = "mississippi";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.printTree();

        String pattern = "iss";
        strie.testFind(pattern);
        pattern = "ip";
        strie.testFind(pattern);
        pattern = "pi";
        strie.testFind(pattern);
        pattern = "miss";
        strie.testFind(pattern);
        pattern = "tt";
        strie.testFind(pattern);
        pattern = "si";
        strie.testFind(pattern);
        pattern = "ssi";
        strie.testFind(pattern);
        pattern = "sissippi";
        strie.testFind(pattern);
        pattern = "ssippi";
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "After a long text, here's a needle ZZZZZ";
        pattern = "ZZZZZ";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "The quick brown fox jumps over the lazy dog.";
        pattern = "lazy";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...";
        pattern = "tempor";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA";
        pattern = "GCAGAGAG";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.testFind(pattern);
    }
}
测试输出:
****************************
|[-1]
|-e[7]#
|-i[1]
|--mize[4]#
|--nimize[2]#
|--ze[6]#
|-mi[0]
|--nimize[2]#
|--ze[6]#
|-nimize[2]#
|-ze[6]#
****************************
|[-1]
|-i[1]#
|--ppi[8]#
|--ssi[2]
|---ppi[8]#
|---ssippi[5]#
|-mississippi[0]#
|-p[8]
|--i[10]#
|--pi[9]#
|-s[2]
|--i[4]
|---ppi[8]#
|---ssippi[5]#
|--si[3]
|---ppi[8]#
|---ssippi[5]#
Found pattern "iss" at: 1
Found pattern "ip" at: 7
Found pattern "pi" at: 9
Found pattern "miss" at: 0
Found no such pattern: "tt"
Found pattern "si" at: 3
Found pattern "ssi" at: 2
Found pattern "sissippi" at: 3
Found pattern "ssippi" at: 5
****************************
Found pattern "ZZZZZ" at: 35
****************************
Found pattern "lazy" at: 35
****************************
Found pattern "tempor" at: 73
****************************
Found pattern "GCAGAGAG" at: 23